From 70506556531e32167b7a327a88aa0eabcfd46603 Mon Sep 17 00:00:00 2001
From: Juan Carlos Jose Camacho
Date: Tue, 23 Apr 2024 10:47:39 -0600
Subject: [PATCH] Add documentation for multi-schemas

---
 README.md                               | 25 ++++++++++++++++++++++---
 dataherald/tests/test_api.py            | 20 --------------------
 docs/api.create_database_connection.rst | 25 ++++++++++++++++++++++++-
 docs/api.get_table_description.rst      |  1 +
 docs/api.list_database_connections.rst  |  2 ++
 docs/api.list_table_description.rst     |  1 +
 docs/api.refresh_table_description.rst  |  1 +
 docs/api.scan_table_description.rst     |  8 ++++----
 8 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index fcfdd1aa..c2a4b768 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,24 @@ curl -X 'POST' \
   '/api/v1/database-connections' \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \
   -d '{
 }'
 ```
 
+##### Connecting to multiple schemas
+You can connect multiple schemas with a single database connection if you want to create SQL joins across schemas.
+Currently only `BigQuery`, `Snowflake`, `Databricks`, and `Postgres` support this feature.
+To use multiple schemas, omit the schema from the `connection_uri` and list them in the `schemas` param instead, like this:
+
+```
+curl -X 'POST' \
+  '/api/v1/database-connections' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "alias": "my_db_alias",
+    "use_ssh": false,
+    "connection_uri": "snowflake://<user>:<password>@<organization>-<account-name>/<database>",
+    "schemas": ["schema_1", "schema_2", ...]
+}'
+```
+
 ##### Connecting to supported Data warehouses and using SSH
 You can find the details on how to connect to the supported data warehouses in the [docs](https://dataherald.readthedocs.io/en/latest/api.create_database_connection.html)
 
@@ -194,7 +212,8 @@ While only the Database scan part is required to start generating SQL, adding ve
 #### Scanning the Database
 The database scan is used to gather information about the database including table and column names and identifying low cardinality columns and their values to be stored in the context store and used in the prompts to the LLM. In addition, it retrieves logs, which consist of historical queries associated with each database table. These records are then stored within the query_history collection. The historical queries retrieved encompass data from the past three months and are grouped based on query and user.
-db_connection_id is the id of the database connection you want to scan, which is returned when you create a database connection.
+The `db_connection_id` param is the id of the database connection you want to scan, which is returned when you create a database connection.
+The `ids` param is the list of ids of the table descriptions that you want to scan.
 
 You can trigger a scan of a database from the `POST /api/v1/table-descriptions/sync-schemas` endpoint. Example below
 
@@ -205,11 +224,11 @@ curl -X 'POST' \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \
   -d '{
     "db_connection_id": "db_connection_id",
-    "table_names": ["table_name"]
+    "ids": ["<table_description_id_1>", "<table_description_id_2>", ...]
 }'
 ```
 
-Since the endpoint identifies low cardinality columns (and their values) it can take time to complete. Therefore while it is possible to trigger a scan on the entire DB by not specifying the `table_names`, we recommend against it for large databases.
+Since the endpoint identifies low cardinality columns (and their values), it can take time to complete.
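+
+If you prefer to script this step, a minimal Python sketch is shown below. It assumes the `requests` package and an engine reachable at `http://localhost`; the base URL and the placeholder table description ids are illustrative and should be replaced with values from your own deployment.
+
+```
+import requests
+
+payload = {
+    "db_connection_id": "db_connection_id",
+    "ids": ["<table_description_id_1>", "<table_description_id_2>"],
+}
+
+# The scan runs as a background task, so this call returns quickly.
+response = requests.post(
+    "http://localhost/api/v1/table-descriptions/sync-schemas",
+    json=payload,
+)
+print(response.status_code)  # 201 when the scan has been queued
+```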
 
 #### Get logs per db connection
 Once a database was scanned you can use this endpoint to retrieve the tables logs
diff --git a/dataherald/tests/test_api.py b/dataherald/tests/test_api.py
index 5d086bc9..f8817b54 100644
--- a/dataherald/tests/test_api.py
+++ b/dataherald/tests/test_api.py
@@ -13,23 +13,3 @@ def test_heartbeat():
     response = client.get("/api/v1/heartbeat")
     assert response.status_code == HTTP_200_CODE
 
-
-def test_scan_all_tables():
-    response = client.post(
-        "/api/v1/table-descriptions/sync-schemas",
-        json={"db_connection_id": "64dfa0e103f5134086f7090c"},
-    )
-    assert response.status_code == HTTP_201_CODE
-
-
-def test_scan_one_table():
-    try:
-        client.post(
-            "/api/v1/table-descriptions/sync-schemas",
-            json={
-                "db_connection_id": "64dfa0e103f5134086f7090c",
-                "table_names": ["foo"],
-            },
-        )
-    except ValueError as e:
-        assert str(e) == "No table found"
diff --git a/docs/api.create_database_connection.rst b/docs/api.create_database_connection.rst
index 60e7ba77..c2fbf724 100644
--- a/docs/api.create_database_connection.rst
+++ b/docs/api.create_database_connection.rst
@@ -26,6 +26,9 @@ Once the database connection is established, it retrieves the table names and cr
       "alias": "string",
       "use_ssh": false,
       "connection_uri": "string",
+      "schemas": [
+        "string"
+      ],
       "path_to_credentials_file": "string",
       "llm_api_key": "string",
       "ssh_settings": {
@@ -189,7 +192,7 @@ Connections to supported Data warehouses
 -----------------------------------------
 
 The format of the ``connection_uri`` parameter in the API call will depend on the data warehouse type you are connecting to.
-You can find samples and how to generate them :ref:.
+You can find samples and how to generate them below.
 
 Postgres
 ^^^^^^^^^^^^
@@ -324,3 +327,23 @@ Example::
 
   "connection_uri": bigquery://v2-real-estate/K2
 
+**Connecting to multiple schemas**
+
+You can connect multiple schemas with a single database connection if you want to create SQL joins across schemas.
+Currently only ``BigQuery``, ``Snowflake``, ``Databricks``, and ``Postgres`` support this feature.
+To use multiple schemas, omit the schema from the ``connection_uri`` and list them in the ``schemas`` param instead, like this:
+
+**Example**
+
+.. code-block:: rst
+
+    curl -X 'POST' \
+      '/api/v1/database-connections' \
+      -H 'accept: application/json' \
+      -H 'Content-Type: application/json' \
+      -d '{
+        "alias": "my_db_alias_identifier",
+        "use_ssh": false,
+        "connection_uri": "snowflake://<user>:<password>@<organization>-<account-name>/<database>",
+        "schemas": ["foo", "bar"]
+      }'
diff --git a/docs/api.get_table_description.rst b/docs/api.get_table_description.rst
index 330e89c7..76b0f8dc 100644
--- a/docs/api.get_table_description.rst
+++ b/docs/api.get_table_description.rst
@@ -24,6 +24,7 @@ HTTP 200 code response
     "table_schema": "string",
     "status": "NOT_SCANNED | SYNCHRONIZING | DEPRECATED | SCANNED | FAILED"
     "error_message": "string",
+    "table_schema": "string",
     "columns": [
       {
         "name": "string",
diff --git a/docs/api.list_database_connections.rst b/docs/api.list_database_connections.rst
index 1396ee23..d688d912 100644
--- a/docs/api.list_database_connections.rst
+++ b/docs/api.list_database_connections.rst
@@ -21,6 +21,7 @@ HTTP 200 code response
       "dialect": "databricks",
       "use_ssh": false,
       "connection_uri": "foooAABk91Q4wjoR2h07GR7_72BdQnxi8Rm6i_EjyS-mzz_o2c3RAWaEqnlUvkK5eGD5kUfE5xheyivl1Wfbk_EM7CgV4SvdLmOOt7FJV-3kG4zAbar=",
+      "schemas": null,
       "path_to_credentials_file": null,
       "llm_api_key": null,
       "ssh_settings": null
@@ -31,6 +32,7 @@ HTTP 200 code response
       "dialect": "postgres",
       "use_ssh": true,
       "connection_uri": null,
+      "schemas": null,
       "path_to_credentials_file": "bar-LWxPdFcjQw9lU7CeK_2ELR3jGBq0G_uQ7E2rfPLk2RcFR4aDO9e2HmeAQtVpdvtrsQ_0zjsy9q7asdsadXExYJ0g==",
       "llm_api_key": "gAAAAABlCz5TeU0ym4hW3bf9u21dz7B9tlnttOGLRDt8gq2ykkblNvpp70ZjT9FeFcoyMv-Csvp3GNQfw66eYvQBrcBEPsLokkLO2Jc2DD-Q8Aw6g_8UahdOTxJdT4izA6MsiQrf7GGmYBGZqbqsjTdNmcq661wF9Q==",
       "ssh_settings": {
diff --git a/docs/api.list_table_description.rst b/docs/api.list_table_description.rst
index e257805d..9bff1318 100644
--- a/docs/api.list_table_description.rst
+++ b/docs/api.list_table_description.rst
@@ -33,6 +33,7 @@ HTTP 200 code response
       "table_schema": "string",
       "status": "NOT_SCANNED | SYNCHRONIZING | DEPRECATED | SCANNED | FAILED"
       "error_message": "string",
+      "table_schema": "string",
       "columns": [
         {
           "name": "string",
diff --git a/docs/api.refresh_table_description.rst b/docs/api.refresh_table_description.rst
index 74c600c1..6e392e79 100644
--- a/docs/api.refresh_table_description.rst
+++ b/docs/api.refresh_table_description.rst
@@ -34,6 +34,7 @@ HTTP 201 code response
       "table_schema": "string",
       "status": "NOT_SCANNED | SYNCHRONIZING | DEPRECATED | SCANNED | FAILED"
       "error_message": "string",
+      "table_schema": "string",
       "columns": [
         {
           "name": "string",
diff --git a/docs/api.scan_table_description.rst b/docs/api.scan_table_description.rst
index 1ccc9f8e..55488cc1 100644
--- a/docs/api.scan_table_description.rst
+++ b/docs/api.scan_table_description.rst
@@ -9,7 +9,7 @@ which consist of historical queries associated with each database table. These r
 query_history collection. The historical queries retrieved encompass data from the past three months and are
 grouped based on query and user.
 
-It can scan all db tables or if you specify a `table_names` then It will only scan those tables.
+The ``ids`` param sets the ids of the table descriptions that you want to scan.
 
 The process is carried out through Background Tasks, ensuring that even if it operates slowly, taking several minutes, the
 HTTP response remains swift.
@@ -23,7 +23,7 @@ Request this ``POST`` endpoint::
 
     {
       "db_connection_id": "string",
-      "table_names": ["string"] # Optional
+      "ids": ["string"]
    }
 
 **Responses**
 
 HTTP 201 code response
 
@@ -36,7 +36,6 @@ HTTP 201 code response
 
 **Request example**
 
-To scan all the tables in a db don't specify a `table_names`
 
 .. code-block:: rst
 
@@ -45,5 +44,6 @@ To scan all the tables in a db don't specify a `table_names`
    -H 'accept: application/json' \
    -H 'Content-Type: application/json' \
    -d '{
-       "db_connection_id": "db_connection_id"
+       "db_connection_id": "db_connection_id",
+       "ids": ["14e52c5f7d6dc4bc510d6d27", "15e52c5f7d6dc4bc510d6d34"]
     }'
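+
+The scan runs as a background task, so a ``201`` response only confirms that the work was queued.
+Below is a minimal Python sketch of how you might wait for a scan to finish by polling the
+``GET /api/v1/table-descriptions`` endpoint. It assumes the ``requests`` package, an engine reachable
+at ``http://localhost``, and that the list endpoint accepts a ``db_connection_id`` query parameter;
+the ``status`` values come from the response examples in these docs, everything else is illustrative.
+
+.. code-block:: python
+
+    import time
+
+    import requests
+
+    BASE_URL = "http://localhost/api/v1"  # illustrative host; adjust to your deployment
+    DB_CONNECTION_ID = "db_connection_id"
+
+    # Poll until no table description is still waiting to be scanned.
+    while True:
+        tables = requests.get(
+            f"{BASE_URL}/table-descriptions",
+            params={"db_connection_id": DB_CONNECTION_ID},
+        ).json()
+        if all(t["status"] not in ("NOT_SCANNED", "SYNCHRONIZING") for t in tables):
+            break
+        time.sleep(10)
+
+    # Report the final status (and any error message) for each table description.
+    for t in tables:
+        print(t["status"], t.get("error_message"))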