From 6f2b1c8dea1b14cb75d0a657efb975f76fd36d92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Ram=C3=ADrez=20Mondrag=C3=B3n?= <16805946+edgarrmondragon@users.noreply.github.com> Date: Wed, 6 Nov 2024 19:24:56 -0600 Subject: [PATCH] refactor: Use `SQLConnector.jsonschema_to_sql` to map JSON schema types to SQL data types (#469) Related: - https://github.com/meltano/sdk/pull/2732 --- target_postgres/connector.py | 109 ++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 45 deletions(-) diff --git a/target_postgres/connector.py b/target_postgres/connector.py index 6ba09b9e..28713e8e 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -17,7 +17,7 @@ import simplejson import sqlalchemy as sa from singer_sdk import SQLConnector -from singer_sdk import typing as th +from singer_sdk.connectors.sql import JSONSchemaToSQL from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, BYTEA, JSONB, UUID from sqlalchemy.engine import URL from sqlalchemy.engine.url import make_url @@ -30,7 +30,6 @@ TEXT, TIME, TIMESTAMP, - VARCHAR, TypeDecorator, ) from sshtunnel import SSHTunnelForwarder @@ -39,6 +38,22 @@ from singer_sdk.connectors.sql import FullyQualifiedName +class JSONSchemaToPostgres(JSONSchemaToSQL): + """Convert JSON Schema types to Postgres types.""" + + def __init__(self, *, content_encoding: bool = True) -> None: + """Initialize the JSONSchemaToPostgres instance.""" + super().__init__() + self.content_encoding = content_encoding + + def handle_raw_string(self, schema): + """Handle a raw string type.""" + if self.content_encoding and schema.get("contentEncoding") == "base16": + return HexByteString() + + return TEXT() + + class PostgresConnector(SQLConnector): """Sets up SQL Alchemy, and other Postgres related stuff.""" @@ -214,7 +229,50 @@ def clone_table( new_table.create(bind=connection) return new_table - def to_sql_type(self, jsonschema_type: dict) -> sa.types.TypeEngine: # type: ignore[override] + def _handle_array_type(self, jsonschema: dict) -> ARRAY | JSONB: + """Handle array type.""" + items = jsonschema.get("items") + # Case 1: items is a string + if isinstance(items, str): + return ARRAY(self.to_sql_type({"type": items})) + + # Case 2: items are more complex + if isinstance(items, dict): + # Case 2.1: items are variants + if "type" not in items: + return ARRAY(JSONB()) + + items_type = items["type"] + + # Case 2.2: items are a single type + if isinstance(items_type, str): + return ARRAY(self.to_sql_type({"type": items_type})) + + # Case 2.3: items are a list of types + if isinstance(items_type, list): + return ARRAY(self.to_sql_type({"type": items_type})) + + # Case 3: tuples + return ARRAY(JSONB()) if isinstance(items, list) else JSONB() + + @cached_property + def jsonschema_to_sql(self) -> JSONSchemaToSQL: + """Return a JSONSchemaToSQL instance with custom type handling.""" + to_sql = JSONSchemaToPostgres(content_encoding=self.interpret_content_encoding) + to_sql.fallback_type = TEXT + to_sql.register_type_handler("integer", BIGINT) + to_sql.register_type_handler("object", JSONB) + to_sql.register_type_handler("array", self._handle_array_type) + to_sql.register_format_handler("date-time", TIMESTAMP) + to_sql.register_format_handler("uuid", UUID) + to_sql.register_format_handler("email", TEXT) + to_sql.register_format_handler("uri", TEXT) + to_sql.register_format_handler("hostname", TEXT) + to_sql.register_format_handler("ipv4", TEXT) + to_sql.register_format_handler("ipv6", TEXT) + return to_sql + + def to_sql_type(self, jsonschema_type: dict) -> sa.types.TypeEngine: """Return a JSON Schema representation of the provided type. By default will call `typing.to_sql_type()`. @@ -270,7 +328,7 @@ def to_sql_type(self, jsonschema_type: dict) -> sa.types.TypeEngine: # type: ig return PostgresConnector.pick_best_sql_type(sql_type_array=sql_type_array) - def pick_individual_type(self, jsonschema_type: dict): # noqa: PLR0911 + def pick_individual_type(self, jsonschema_type: dict): """Select the correct sql type assuming jsonschema_type has only a single type. Args: @@ -281,47 +339,8 @@ def pick_individual_type(self, jsonschema_type: dict): # noqa: PLR0911 """ if "null" in jsonschema_type["type"]: return None - if "integer" in jsonschema_type["type"]: - return BIGINT() - if "object" in jsonschema_type["type"]: - return JSONB() - if "array" in jsonschema_type["type"]: - items = jsonschema_type.get("items") - # Case 1: items is a string - if isinstance(items, str): - return ARRAY(self.to_sql_type({"type": items})) - - # Case 2: items are more complex - if isinstance(items, dict): - # Case 2.1: items are variants - if "type" not in items: - return ARRAY(JSONB()) - - items_type = items["type"] - - # Case 2.2: items are a single type - if isinstance(items_type, str): - return ARRAY(self.to_sql_type({"type": items_type})) - - # Case 2.3: items are a list of types - if isinstance(items_type, list): - return ARRAY(self.to_sql_type({"type": items_type})) - - # Case 3: tuples - return ARRAY(JSONB()) if isinstance(items, list) else JSONB() - - # string formats - if jsonschema_type.get("format") == "date-time": - return TIMESTAMP() - if jsonschema_type.get("format") == "uuid": - return UUID() - if ( - self.interpret_content_encoding - and jsonschema_type.get("contentEncoding") == "base16" - ): - return HexByteString() - individual_type = th.to_sql_type(jsonschema_type) - return TEXT() if isinstance(individual_type, VARCHAR) else individual_type + + return self.jsonschema_to_sql.to_sql_type(jsonschema_type) @staticmethod def pick_best_sql_type(sql_type_array: list):