diff --git a/crate_anon/anonymise/ddr.py b/crate_anon/anonymise/ddr.py index edd1dfcc..42008579 100644 --- a/crate_anon/anonymise/ddr.py +++ b/crate_anon/anonymise/ddr.py @@ -42,6 +42,7 @@ ensure_valid_field_name, ensure_valid_table_name, is_sqltype_valid, + SQLTYPE_DATE, ) from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName from cardinal_pythonlib.sqlalchemy.schema import ( @@ -78,7 +79,6 @@ is_sql_column_type_textual, matches_fielddef, matches_tabledef, - SQLTYPE_DATE, ) if TYPE_CHECKING: diff --git a/crate_anon/common/sql.py b/crate_anon/common/sql.py index bf1a1897..cba047b1 100644 --- a/crate_anon/common/sql.py +++ b/crate_anon/common/sql.py @@ -55,6 +55,13 @@ make_grammar, mysql_grammar, ) +from cardinal_pythonlib.sql.validation import ( + SQLTYPES_INTEGER, + SQLTYPES_BIT, + SQLTYPES_FLOAT, + SQLTYPES_TEXT, + SQLTYPES_OTHER_NUMERIC, +) from cardinal_pythonlib.sqlalchemy.core_query import count_star from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName from cardinal_pythonlib.sqlalchemy.schema import ( @@ -88,48 +95,17 @@ # Constants # ============================================================================= +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Generic +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + TIMING_COMMIT = "commit" SQL_OPS_VALUE_UNNECESSARY = ["IS NULL", "IS NOT NULL"] SQL_OPS_MULTIPLE_VALUES = ["IN", "NOT IN"] -SQLTYPE_DATE = "DATE" - -SQLTYPES_INTEGER = [ - "INT", - "INTEGER", - "TINYINT", - "SMALLINT", - "MEDIUMINT", - "BIGINT", - "BIT", - "BOOL", - "BOOLEAN", -] -SQLTYPES_FLOAT = [ - "DOUBLE", - "FLOAT", - "DEC", - "DECIMAL", -] -SQLTYPES_TEXT = [ - "CHAR", - "VARCHAR", - "NVARCHAR", - "TINYTEXT", - "TEXT", - "NTEXT", - "MEDIUMTEXT", - "LONGTEXT", -] -SQLTYPES_WITH_DATE = [ - SQLTYPE_DATE, - "DATETIME", - "TIMESTAMP", -] -# SQLTYPES_BINARY = [ -# "BINARY", "BLOB", "IMAGE", "LONGBLOB", "VARBINARY", -# ] +SQLTYPES_INTEGER_OR_BIT = 
SQLTYPES_INTEGER + SQLTYPES_BIT +SQLTYPES_FLOAT_OR_OTHER_NUMERIC = SQLTYPES_FLOAT + SQLTYPES_OTHER_NUMERIC # Must match querybuilder.js: QB_DATATYPE_INTEGER = "int" @@ -147,15 +123,15 @@ # Dictionaries for the different dialects mapping text column type to length # or default length. # Doesn't include things like VARCHAR which require the user to specify length -MYSQL_COLTYPE_TO_LEN = { - # https://dev.mysql.com/doc/refman/8.0/en/string-type-overview.html - "CHAR": 1, # can specify CHAR(0) to CHAR(255), but if omitted, length is 1 - "TINYTEXT": 255, # 2^8 - 1 - "TEXT": 65535, # 2^16 - 1 - "MEDIUMTEXT": 16777215, # 2^24 - 1 - "LONGTEXT": 4294967295, # 2^32 - 1 -} +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# SQLAlchemy dialects +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DATABRICKS_COLTYPE_TO_LEN = { + # https://docs.databricks.com/en/sql/language-manual/data-types/string-type.html # noqa: E501 + "STRING": None # There is no maximum. +} MSSQL_COLTYPE_TO_LEN = { # The "N" prefix means Unicode. 
# https://docs.microsoft.com/en-us/sql/t-sql/data-types/char-and-varchar-transact-sql?view=sql-server-ver15 # noqa: E501 @@ -168,16 +144,20 @@ "TEXT": 2**31 - 1, "NTEXT": 2**30 - 1, } +MYSQL_COLTYPE_TO_LEN = { + # https://dev.mysql.com/doc/refman/8.0/en/string-type-overview.html + "CHAR": 1, # can specify CHAR(0) to CHAR(255), but if omitted, length is 1 + "TINYTEXT": 255, # 2^8 - 1 + "TEXT": 65535, # 2^16 - 1 + "MEDIUMTEXT": 16777215, # 2^24 - 1 + "LONGTEXT": 4294967295, # 2^32 - 1 +} - -# def combine_db_schema_table(db: Optional[str], -# schema: Optional[str], -# table: str) -> str: -# # ANSI SQL: http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt -# # , -# if not table: -# raise ValueError("Missing table supplied to combine_db_schema_table") -# return ".".join(x for x in [db, schema, table] if x) +DIALECT_TO_STRING_LEN_LOOKUP = { + SqlaDialectName.DATABRICKS: DATABRICKS_COLTYPE_TO_LEN, + SqlaDialectName.MSSQL: MSSQL_COLTYPE_TO_LEN, + SqlaDialectName.MYSQL: MYSQL_COLTYPE_TO_LEN, +} # ============================================================================= @@ -2314,7 +2294,7 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]: Args: column_type: SQL column type as a string, e.g. ``"VARCHAR(50)"`` - dialect: the sql dialect the column type is from + dialect: the SQL dialect the column type is from Returns: length of the column or ``None`` if it's not a text column. @@ -2324,21 +2304,20 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]: if column_type in SQLTYPES_TEXT: # No length specified - get the default try: - if dialect == SqlaDialectName.MYSQL: - return MYSQL_COLTYPE_TO_LEN[column_type] - elif dialect == SqlaDialectName.MSSQL: - return MSSQL_COLTYPE_TO_LEN[column_type] - else: - raise ValueError( - f"{dialect} is not a valid SQL dialect. 
Must " - f"be one of: {SqlaDialectName.MYSQL!r}, " - f"{SqlaDialectName.MSSQL!r}" - ) + lookup = DIALECT_TO_STRING_LEN_LOOKUP[dialect] except KeyError: - log.error( - f"SQL dialect {dialect} has no data type " f"{column_type}" + possible = list(DIALECT_TO_STRING_LEN_LOOKUP.keys()) + raise ValueError( + f"CRATE doesn't properly understand SQL dialect {dialect!r}. " + f"Supported: {possible}" + ) + try: + return lookup[column_type] + except KeyError: + raise ValueError( + f"For SQL dialect {dialect!r}, CRATE doesn't know the length " + f"for string data type {column_type!r}" ) - raise else: # Length specified - get it from the column type try: @@ -2346,21 +2325,20 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]: basetype = m.group(1) length = m.group(2) if length == "MAX" or length == "-1": - if basetype == "VARCHAR": - return MSSQL_COLTYPE_TO_LEN["VARCHAR_MAX"] - elif basetype == "NVARCHAR": - return MSSQL_COLTYPE_TO_LEN["NVARCHAR_MAX"] - else: - return None + if dialect == SqlaDialectName.MSSQL: + if basetype == "VARCHAR": + return MSSQL_COLTYPE_TO_LEN["VARCHAR_MAX"] + elif basetype == "NVARCHAR": + return MSSQL_COLTYPE_TO_LEN["NVARCHAR_MAX"] + return None except AttributeError: # Not the correct type of column return None try: - length = int(length) + return int(length) except ValueError: # Not the correct type of column return None - return length def escape_quote_in_literal(s: str) -> str: diff --git a/crate_anon/crateweb/research/research_db_info.py b/crate_anon/crateweb/research/research_db_info.py index c8ad275b..82e4ec00 100644 --- a/crate_anon/crateweb/research/research_db_info.py +++ b/crate_anon/crateweb/research/research_db_info.py @@ -41,6 +41,10 @@ from cardinal_pythonlib.logs import BraceStyleAdapter from cardinal_pythonlib.reprfunc import auto_repr from cardinal_pythonlib.sql.sql_grammar import SqlGrammar +from cardinal_pythonlib.sql.validation import ( + SQLTYPES_TEXT, + SQLTYPES_WITH_DATE, +) from 
cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName from cardinal_pythonlib.sqlalchemy.schema import ( MSSQL_DEFAULT_SCHEMA, @@ -56,21 +60,19 @@ from crate_anon.common.constants import RUNNING_WITHOUT_CONFIG from crate_anon.common.sql import ( ColumnId, - is_sql_column_type_textual, - make_grammar, QB_DATATYPE_DATE, QB_DATATYPE_FLOAT, QB_DATATYPE_INTEGER, QB_DATATYPE_STRING, QB_DATATYPE_STRING_FULLTEXT, QB_DATATYPE_UNKNOWN, + SQLTYPES_FLOAT_OR_OTHER_NUMERIC, + SQLTYPES_INTEGER_OR_BIT, SchemaId, SqlArgsTupleType, - SQLTYPES_FLOAT, - SQLTYPES_WITH_DATE, - SQLTYPES_TEXT, - SQLTYPES_INTEGER, TableId, + is_sql_column_type_textual, + make_grammar, translate_sql_qmark_to_percent, ) from crate_anon.crateweb.core.constants import SettingsKeys @@ -151,7 +153,7 @@ def querybuilder_type(self) -> str: defines our field type, like ``"int"`` or ``"date"``. See source. """ basetype = self.basetype - if basetype in SQLTYPES_FLOAT: + if basetype in SQLTYPES_FLOAT_OR_OTHER_NUMERIC: return QB_DATATYPE_FLOAT if basetype in SQLTYPES_WITH_DATE: return QB_DATATYPE_DATE @@ -160,7 +162,7 @@ def querybuilder_type(self) -> str: return QB_DATATYPE_STRING_FULLTEXT else: return QB_DATATYPE_STRING - if basetype in SQLTYPES_INTEGER: + if basetype in SQLTYPES_INTEGER_OR_BIT: return QB_DATATYPE_INTEGER return QB_DATATYPE_UNKNOWN diff --git a/crate_anon/preprocess/systmone_ddgen.py b/crate_anon/preprocess/systmone_ddgen.py index 16b4dd05..710b6dfc 100644 --- a/crate_anon/preprocess/systmone_ddgen.py +++ b/crate_anon/preprocess/systmone_ddgen.py @@ -427,6 +427,7 @@ from cardinal_pythonlib.dicts import reversedict from cardinal_pythonlib.enumlike import CaseInsensitiveEnumMeta +from cardinal_pythonlib.sql.validation import SQLTYPE_DATE from crate_anon.anonymise.altermethod import AlterMethod from crate_anon.anonymise.constants import ( @@ -437,7 +438,6 @@ SrcFlag, ) from crate_anon.common.logfunc import warn_once -from crate_anon.common.sql import SQLTYPE_DATE from crate_anon.anonymise.dd 
import DataDictionary, DataDictionaryRow from crate_anon.preprocess.constants import CRATE_COL_PK diff --git a/devnotes/2025_01_sqlalchemy2_databricks/notes_databricks.txt b/devnotes/2025_01_sqlalchemy2_databricks/notes_databricks.txt new file mode 100644 index 00000000..f570b6e9 --- /dev/null +++ b/devnotes/2025_01_sqlalchemy2_databricks/notes_databricks.txt @@ -0,0 +1,40 @@ +=============================================================================== +CHANGES FOR DATABRICKS SUPPORT +=============================================================================== + +Databricks is a cloud-based data platform: +https://en.wikipedia.org/wiki/Databricks. + + +------------------------------------------------------------------------------- +Can you run Databricks locally? +------------------------------------------------------------------------------- + +Maybe not. This set of instructions about a Docker image looks ultimately to be +about creating a Docker image that you can push to an existing Databricks +cluster and run there: + +https://medium.com/@d.v.rademaker/do-it-yourself-building-your-own-databricks-docker-container-9cd670612927 + + +------------------------------------------------------------------------------- +Databricks for SQLAlchemy +------------------------------------------------------------------------------- + +Tutorial for Azure Databricks: + + https://learn.microsoft.com/en-us/azure/databricks/dev-tools/sqlalchemy + +... 
has a broken link to its example sqlalchemy.py + +The basic SQL connector for Databricks is: + + https://github.com/databricks/databricks-sql-python + +The SQLAlchemy dialect is: + + https://github.com/databricks/databricks-sqlalchemy + +To import it: + + from databricks.sqlalchemy import DatabricksDialect diff --git a/devnotes/2025_01_sqlalchemy2.txt b/devnotes/2025_01_sqlalchemy2_databricks/notes_sqlalchemy2.txt similarity index 99% rename from devnotes/2025_01_sqlalchemy2.txt rename to devnotes/2025_01_sqlalchemy2_databricks/notes_sqlalchemy2.txt index b0f2c842..009bee31 100644 --- a/devnotes/2025_01_sqlalchemy2.txt +++ b/devnotes/2025_01_sqlalchemy2_databricks/notes_sqlalchemy2.txt @@ -133,6 +133,7 @@ Migration to 2.0 Step Three - Resolve all RemovedIn20Warnings - pytest ... remember pytest -k; pytest --log-cli-level - crate_make_demo_database + - crate_anon_draft_dd - crate_anonymise --full - crate_anonymise --incremental - crate_nlp --nlpdef crate_biomarkers --full diff --git a/devnotes/2025_01_sqlalchemy2_databricks/pipeline_test.sh b/devnotes/2025_01_sqlalchemy2_databricks/pipeline_test.sh new file mode 100755 index 00000000..fcca045b --- /dev/null +++ b/devnotes/2025_01_sqlalchemy2_databricks/pipeline_test.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -ex + +if [ -z "$TMP_CRATE_DEMO_DATABASE_URL" ]; then + echo "Please set environment variable TMP_CRATE_DEMO_DATABASE_URL first." + exit 1 +fi + +crate_make_demo_database "${TMP_CRATE_DEMO_DATABASE_URL}" +crate_anon_draft_dd +crate_anonymise --full +crate_anonymise --incremental +crate_nlp --nlpdef crate_biomarkers --full +crate_nlp --nlpdef crate_biomarkers --incremental +crate_researcher_report ~/Downloads/tmp_crate_researcher_report.pdf + +pytest # Do this last: warnings (which may be OK) cause exit code failure. 
diff --git a/setup.py b/setup.py index 257119fb..21e4c4b4 100755 --- a/setup.py +++ b/setup.py @@ -69,7 +69,8 @@ "appdirs==1.4.4", # where to store some temporary data "arrow==0.15.7", # [pin exact version from cardinal_pythonlib] "beautifulsoup4==4.9.1", # [pin exact version from cardinal_pythonlib] - "cardinal_pythonlib==2.0.0", # RNC libraries + # "cardinal_pythonlib==2.0.0", # RNC libraries + "cardinal_pythonlib @ git+https://github.com/RudolfCardinal/pythonlib@sqlalchemy2#egg=cardinal_pythonlib-2.0.0-rc1", # RNC libraries, development version # noqa: E501 "cairosvg==2.7.0", # work with SVG files "celery==5.2.7", # back-end scheduling "chardet==3.0.4", # character encoding detection for cardinal_pythonlib