From 01f7824345824052103b544952dacee15b4cbdc7 Mon Sep 17 00:00:00 2001 From: Daniel Bachler Date: Fri, 10 May 2024 14:30:28 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20first=20version=20of=20owid=20datas?= =?UTF-8?q?ette=20oracle?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/wizard/config/config.yml | 6 + apps/wizard/pages/owid_datasette_oracle.py | 223 ++++++++ .../pages/owid_datasette_oracle_prompt.py | 486 ++++++++++++++++++ 3 files changed, 715 insertions(+) create mode 100644 apps/wizard/pages/owid_datasette_oracle.py create mode 100644 apps/wizard/pages/owid_datasette_oracle_prompt.py diff --git a/apps/wizard/config/config.yml b/apps/wizard/config/config.yml index 1ccf90fa7ca..a78c7b501a1 100644 --- a/apps/wizard/config/config.yml +++ b/apps/wizard/config/config.yml @@ -35,6 +35,12 @@ main: maintainer: "@lucas" entrypoint: pages/expert/app.py emoji: "🧙" + oracle: + title: "OWID Datasette Oracle" + description: "Get help writing SQL queries for datasette!" + maintainer: "@daniel" + entrypoint: pages/owid_datasette_oracle.py + emoji: "🔮" # ETL steps etl: diff --git a/apps/wizard/pages/owid_datasette_oracle.py b/apps/wizard/pages/owid_datasette_oracle.py new file mode 100644 index 00000000000..288da256278 --- /dev/null +++ b/apps/wizard/pages/owid_datasette_oracle.py @@ -0,0 +1,223 @@ +"""Ask chat GPT for help writing datasette queries. 
+""" +from typing import Any, Dict, cast + +import streamlit as st +from st_pages import add_indentation +from streamlit_feedback import streamlit_feedback +from structlog import get_logger + +from apps.wizard.pages.owid_datasette_oracle_prompt import OWID_DATASETTE_ORACLE_PROMPT +from apps.wizard.utils import set_states +from apps.wizard.utils.db import DB_IS_SET_UP, WizardDB +from apps.wizard.utils.gpt import OpenAIWrapper, get_cost_and_tokens +from etl.config import load_env + +# LOG +log = get_logger() + +# CONFIG +st.set_page_config(page_title="OWID Datasette Oracle", page_icon="🔮") +add_indentation() +## Title/subtitle +st.title("**OWID Datasette oracle** 🔮") +st.markdown("Get help writing SQL queries for Datasette!") + +## Load variables +load_env() + + +@st.cache_data(show_spinner=True) +def ask_gpt(query, model): + response = api.query_gpt(query, model=model) + return response + + +# GPT CONFIG +MODEL_DEFAULT = "gpt-4-turbo-preview" +MODELS_AVAILABLE = { + "gpt-3.5-turbo-0125": "GPT-3.5 Turbo (gpt-3.5-turbo-0125)", + "gpt-4-turbo-preview": "GPT-4 Turbo (gpt-4-turbo-preview)", +} +MODELS_AVAILABLE_LIST = list(MODELS_AVAILABLE.keys()) + + +# Handle feedback +def handle_feedback(feedback: Dict[str, Any]) -> None: + """Handle feedback.""" + print("handle feedback") + print(feedback) + # st.write(feedback) + # st.write(st.session_state.prompt) + # st.write(st.session_state.response) + WizardDB().add_usage( + question=st.session_state.messages[-2]["content"], + answer=st.session_state.response, + feedback=1 if feedback["score"] == "👍" else 0, + feedback_text=feedback.get("text", None), + cost=st.session_state.cost_last, + ) + + +# Switch category function +def get_system_prompt() -> str: + """Get appropriate system prompt.""" + return OWID_DATASETTE_ORACLE_PROMPT + + +# Reset chat history +def reset_messages() -> None: + """Reset messages to default.""" + set_states( + { + "messages": [{"role": "system", "content": get_system_prompt()}], + "response": None, + 
"prompt": None, + } + ) + + +## Examples +EXAMPLE_QUERIES = [ + "> Which are our top 10 articles by pageviews?", + "> How many charts do we have that use only a single indicator?", + "> Do we have datasets whose indicators are not used in any chart?", +] +with st.popover("See examples"): + for example in EXAMPLE_QUERIES: + st.markdown(example) + +# Sidebar with GPT config +st.session_state.analytics = st.session_state.get("analytics", True) +with st.sidebar: + st.button( + label="Clear chat", + on_click=reset_messages, + ) + st.divider() + st.toggle( + label="Collect data for analytics", + value=True, + on_change=lambda: set_states( + { + "analytics": not st.session_state.analytics, + } + ), + help="If enabled, we will collect usage data to improve the app. \n\nThis **is really helpful to improve** how we query chat GPT: E.g. which system prompt to use, optimise costs, and much more 😊. \n\nData collected: questions, responses and feedback submitted. \n\nYou can see how this data is collected [here](https://github.com/owid/etl/blob/master/apps/wizard/utils/db.py). \n\nRecords are anonymous.", + ) + st.divider() + st.markdown("## GPT Configuration") + model_name = st.selectbox( + label="Select GPT model", + options=MODELS_AVAILABLE_LIST, + format_func=lambda x: MODELS_AVAILABLE[x], + index=MODELS_AVAILABLE_LIST.index(MODEL_DEFAULT), + help="[Pricing](https://openai.com/pricing) | [Model list](https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo)", + ) + ## See pricing list: https://openai.com/pricing (USD) + ## See model list: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo + + use_reduced_context = st.toggle( + "Reduced context window", + value=False, + help="If checked, only the last user message will be accounted (i.e less tokens and therefore cheaper).", + ) + temperature = st.slider( + "Temperature", + min_value=0.0, + max_value=2.0, + value=0.15, + step=0.01, + help="What sampling temperature to use, between 0 and 2. 
Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.", + ) + max_tokens = int( + st.number_input( + "Max tokens", + min_value=32, + max_value=4096, + value=4096, + step=32, + help="The maximum number of tokens in the response.", + ) + ) + +# API with OPENAI +api = OpenAIWrapper() + +# ACTUAL APP +# Initialize chat history +if "messages" not in st.session_state: + reset_messages() + +# DEBUG +# st.write([m for m in st.session_state.messages if m["role"] != "system"]) + +# Display chat messages from history on app rerun +for message in st.session_state.messages: + if message["role"] != "system": + with st.chat_message(message["role"]): + st.markdown(message["content"]) + +# Initialise session state +st.session_state.response = st.session_state.get("response", None) +st.session_state.prompt = st.session_state.get("prompt", None) +st.session_state.feedback_key = st.session_state.get("feedback_key", 0) +st.session_state.cost_last = st.session_state.get("cost_last", 0) + +# React to user input +if prompt := st.chat_input("Ask me!"): + st.session_state.feedback_key += 1 + print("asking GPT...") + # Display user message in chat message container + with st.chat_message("user"): + st.markdown(prompt) + + # Add user message to chat history + st.session_state.messages.append({"role": "user", "content": prompt}) + + # Build GPT query (only use the system prompt and latest user input) + if use_reduced_context: + messages = [{"role": "system", "content": get_system_prompt()}, {"role": "user", "content": prompt}] + else: + messages = st.session_state.messages + + # Display assistant response in chat message container + with st.chat_message("assistant"): + # Ask GPT (stream) + stream = api.chat.completions.create( + model=cast(str, model_name), + messages=messages, # type: ignore + temperature=temperature, + max_tokens=max_tokens, + stream=True, + ) + st.session_state.response = cast(str, 
st.write_stream(stream)) + + # Add new response by the System + st.session_state.messages.append({"role": "assistant", "content": st.session_state.response}) + + # Add prompt to session state + st.session_state.prompt = prompt + + print("finished asking GPT...") + +if st.session_state.response: + # Get cost & tokens + text_in = "\n".join([m["content"] for m in st.session_state.messages]) + cost, num_tokens = get_cost_and_tokens(text_in, st.session_state.response, cast(str, model_name)) + cost_msg = f"**Cost**: ≥{cost} USD.\n\n **Tokens**: ≥{num_tokens}." + st.session_state.cost_last = cost + + if DB_IS_SET_UP and st.session_state.analytics: + # Get feedback only if DB is properly setup + feedback = streamlit_feedback( + feedback_type="thumbs", + optional_text_label="[Optional] Please provide an explanation", + key=f"feedback_{st.session_state.feedback_key}", + on_submit=handle_feedback, + ) + # Show cost below feedback + st.info(cost_msg) + +# DEBUG +# st.write([m for m in st.session_state.messages if m["role"] != "system"]) diff --git a/apps/wizard/pages/owid_datasette_oracle_prompt.py b/apps/wizard/pages/owid_datasette_oracle_prompt.py new file mode 100644 index 00000000000..15eeeeeac43 --- /dev/null +++ b/apps/wizard/pages/owid_datasette_oracle_prompt.py @@ -0,0 +1,486 @@ +OWID_DATASETTE_ORACLE_PROMPT = """ +## OWID datasette Oracle V2 + +OWID Datasette Oracle is designed to effectively utilize the provided database schema, making intelligent use of foreign key constraints to deduce relationships from natural language inquiries. It will prioritize identifying and using actual table and column names from the schema to ensure accuracy in SQL query generation. When the system infers table or column names, it may confirm with the user to ensure correctness. The SQL dialect used is SQLite. + +The schema is provided in yaml below. The top level array represents the tables, with a "name" field and an optional "description" field. 
The columns are listed under the "columns" key. If a column has a foreign key constraint onto another table, this is specified with the fields "fkTargetTable" and "fkTargetColumn". + +```yaml +- name: algolia_searches_by_week + columns: + - name: week_start_date + - name: index + - name: query + - name: total_searches + - name: total_hits +- name: analytics_pageviews + description: | + contains information on pageviews which can be very useful to order results by (e.g. to show + posts with the most pageviews first). The `url` of this table contains full urls - to match + it up with the `slug` column on `posts` or `posts_gdocs` or `charts` table you have to turn + those into full urls. `posts` and `posts_gdocs` slug just needs to be prefixed with + `https://ourworldindata.org/`, for charts it is `https://ourworldindata.org/grapher/`, + for explorers it is `https://ourworldindata.org/explorers/` + columns: + - name: day + - name: url + - name: views_7d + - name: views_14d + - name: views_365d + - name: url_domain + - name: url_path + - name: url_query + - name: url_fragment +- name: chart_dimensions + description: this table enumerates the variables (aka indicators) that are used in a chart + columns: + - name: id + - name: order + - name: property + - name: chartId + fkTargetTable: charts + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: chart_slug_redirects + description: this table contains alternative slugs pointing to charts + columns: + - name: id + - name: slug + - name: chart_id + fkTargetTable: charts + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: chart_tags + columns: + - name: chartId + fkTargetTable: charts + fkTargetColumn: id + - name: tagId + fkTargetTable: tags + fkTargetColumn: id + - name: keyChartLevel + - name: createdAt + - name: updatedAt + - name: isApproved +- name: chart_variables + columns: + - name: chartId + fkTargetTable: charts 
+ fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id +- name: charts + description: | + contains the configuration for our data visualization. The `config` column contains a json + configuration for the chart. Important fields inside this json are hasMapTab, hasChartTab, + title, subtitle, slug and type (one of LineChart ScatterPlot StackedArea DiscreteBar + StackedDiscreteBar SlopeChart StackedBar Marimekko or missing in which case LineChart is the default) + columns: + - name: id + - name: slug + - name: type + - name: config + - name: createdAt + - name: updatedAt + - name: lastEditedAt + - name: publishedAt + - name: lastEditedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: publishedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: is_indexable + - name: title + - name: subtitle + - name: note + - name: title_plus_variant + - name: configWithDefaults +- name: dataset_tags + columns: + - name: datasetId + fkTargetTable: datasets + fkTargetColumn: id + - name: tagId + fkTargetTable: tags + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: datasets + description: a collection of variables + columns: + - name: id + - name: name + - name: description + - name: createdAt + - name: updatedAt + - name: namespace + - name: isPrivate + - name: createdByUserId + fkTargetTable: users + fkTargetColumn: id + - name: metadataEditedAt + - name: metadataEditedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: dataEditedAt + - name: dataEditedByUserId + fkTargetTable: users + fkTargetColumn: id + - name: nonRedistributable + - name: isArchived + - name: sourceChecksum + - name: shortName + - name: version + - name: updatePeriodDays +- name: entities + columns: + - name: id + - name: code + - name: name + - name: validated + - name: createdAt + - name: updatedAt + - name: displayName +- name: explorer_charts + columns: + - name: id + - name: explorerSlug + fkTargetTable: explorers 
+ fkTargetColumn: slug + - name: chartId + fkTargetTable: charts + fkTargetColumn: id +- name: explorer_tags + columns: + - name: id + - name: explorerSlug + - name: tagId + fkTargetTable: tags + fkTargetColumn: id +- name: explorer_variables + columns: + - name: id + - name: explorerSlug + fkTargetTable: explorers + fkTargetColumn: slug + - name: variableId + fkTargetTable: variables + fkTargetColumn: id +- name: explorers + description: | + contains our explorers, which are more complex data visualisations. They can include charts but can also be configured differently. If they are using charts then the link is established in the `explorer_charts` table. Linking this to `variables` can be done as well but if doing so, alert the user to the fact that there are a lot of connections between these entities that are not tracked in the database. + columns: + - name: slug + - name: isPublished + - name: config + - name: createdAt + - name: updatedAt +- name: images + columns: + - name: id + - name: googleId + - name: filename + - name: defaultAlt + - name: originalWidth + - name: updatedAt + - name: originalHeight +- name: namespaces + columns: + - name: id + - name: name + - name: description + - name: isArchived + - name: createdAt + - name: updatedAt +- name: origins + columns: + - name: id + - name: titleSnapshot + - name: title + - name: descriptionSnapshot + - name: description + - name: producer + - name: citationFull + - name: attribution + - name: attributionShort + - name: versionProducer + - name: urlMain + - name: urlDownload + - name: dateAccessed + - name: datePublished + - name: license +- name: origins_variables + columns: + - name: originId + fkTargetTable: origins + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: displayOrder +- name: post_broken_chart_links + columns: + - name: id + - name: postId + fkTargetTable: posts + fkTargetColumn: id + - name: chartSlug + - name: kind +- name: post_charts + 
columns: + - name: id + - name: postId + fkTargetTable: posts + fkTargetColumn: id + - name: chartId + fkTargetTable: charts + fkTargetColumn: id + - name: kind + - name: through_redirect +- name: post_links + columns: + - name: id + - name: postId + fkTargetTable: posts + fkTargetColumn: id + - name: link + - name: kind +- name: post_tags + columns: + - name: post_id + fkTargetTable: posts + fkTargetColumn: id + - name: tag_id + fkTargetTable: tags + fkTargetColumn: id + - name: createdAt + - name: updatedAt +- name: posts + description: | + The table for our old posts that were written in wordpress. It contains the html content of the post in the `content` column + and a markdown version of the content in the `markdown` column. + columns: + - name: id + - name: title + - name: slug + - name: type + - name: status + - name: content + - name: archieml + - name: archieml_update_statistics + - name: published_at + - name: updated_at + - name: gdocSuccessorId + - name: authors + - name: excerpt + - name: created_at_in_wordpress + - name: updated_at_in_wordpress + - name: featured_image + - name: formattingOptions + - name: markdown + - name: wpApiSnapshot +- name: posts_gdocs + description: | + The table for our new posts written in Google Docs. It contains content in form of json in the `content` column and a + markdown version of the content in the `markdown` column. 
+ columns: + - name: id + - name: slug + - name: type + - name: content + - name: published + - name: createdAt + - name: publishedAt + - name: updatedAt + - name: publicationContext + - name: revisionId + - name: breadcrumbs + - name: markdown + - name: title +- name: posts_gdocs_links + columns: + - name: id + - name: sourceId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: target + - name: linkType + - name: componentType + - name: text + - name: queryString + - name: hash +- name: posts_gdocs_variables_faqs + columns: + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: fragmentId + - name: displayOrder +- name: posts_gdocs_x_images + columns: + - name: id + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: imageId + fkTargetTable: images + fkTargetColumn: id +- name: posts_gdocs_x_tags + columns: + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: tagId + fkTargetTable: tags + fkTargetColumn: id +- name: posts_links + columns: + - name: id + - name: sourceId + fkTargetTable: posts + fkTargetColumn: id + - name: target + - name: linkType + - name: componentType + - name: text + - name: queryString + - name: hash +- name: posts_unified + description: | + this table combines posts and posts_gdocs. To get the content you need to join it with + posts and posts_gdocs but this is the best place to query e.g. all titles. Type is one of: article homepage topic-page linear-topic-page data-insight author about-page. We sometimes call topic-page pages "Modular topic pages". 
+ columns: + - name: id + - name: slug + - name: title + - name: type + - name: publishedAt + - name: updatedAt + - name: authors + - name: createdAt + - name: publicationContext + - name: gdocId + fkTargetTable: posts_gdocs + fkTargetColumn: id + - name: wordpressId + fkTargetTable: posts + fkTargetColumn: id +- name: redirects + columns: + - name: id + - name: source + - name: target + - name: code + - name: createdAt + - name: updatedAt +- name: sources + columns: + - name: id + - name: name + - name: description + - name: createdAt + - name: updatedAt + - name: datasetId + fkTargetTable: datasets + fkTargetColumn: id + - name: additionalInfo + - name: link + - name: dataPublishedBy +- name: sqlite_sequence + columns: + - name: name + - name: seq +- name: tags + columns: + - name: id + - name: name + - name: createdAt + - name: updatedAt + - name: parentId + fkTargetTable: tags + fkTargetColumn: id + - name: specialType + - name: slug +- name: tags_variables_topic_tags + columns: + - name: tagId + fkTargetTable: tags + fkTargetColumn: id + - name: variableId + fkTargetTable: variables + fkTargetColumn: id + - name: displayOrder +- name: users + columns: + - name: id + - name: password + - name: lastLogin + - name: isSuperuser + - name: email + - name: createdAt + - name: updatedAt + - name: isActive + - name: fullName + - name: lastSeen +- name: variables + columns: + - name: id + - name: name + - name: unit + - name: description + - name: createdAt + - name: updatedAt + - name: code + - name: coverage + - name: timespan + - name: datasetId + fkTargetTable: datasets + fkTargetColumn: id + - name: sourceId + fkTargetTable: sources + fkTargetColumn: id + - name: shortUnit + - name: display + - name: columnOrder + - name: originalMetadata + - name: grapherConfigAdmin + - name: shortName + - name: catalogPath + - name: dimensions + - name: schemaVersion + - name: processingLevel + - name: processingLog + - name: titlePublic + - name: titleVariant + - name: 
attributionShort + - name: attribution + - name: descriptionShort + - name: descriptionFromProducer + - name: descriptionKey + - name: descriptionProcessing + - name: licenses + - name: license + - name: grapherConfigETL + - name: type + - name: sort + +``` + +The content of the database is all the information for the Our World In Data website, a publication with writing and data visualization about the world's biggest problems. + +For questions about posts, articles, topic pages and so on, posts_unified is usually the best starting point and you should prefer querying that table over posts or posts_gdocs unless there is a compelling reason. For questions about grapher charts it is charts. For questions about indicators or variables it is variables. + +Your job is to create a SQL query for the user that answers their question given the schema above. You may ask the user for clarification, e.g. if it is unclear if unpublished items should be included (when applicable) or if there is ambiguity in which tables to use to answer a question. + +Upon generating a query, OWID Datasette Oracle will always provide the SQL query both as text and as a clickable Datasette link, formatted for the user's convenience. The datasette URL is http://datasette-private and the database name is owid. An example query to get all rows from the algolia_searches_by_week table is this one that demonstrates the escaping: `http://datasette-private/owid?sql=select+*+from+algolia_searches_by_week` Remember, you cannot actually run the SQL query, you are just to output the query as text and a datasette link that will run that query! +"""