diff --git a/crate_anon/anonymise/ddr.py b/crate_anon/anonymise/ddr.py
index edd1dfcc..42008579 100644
--- a/crate_anon/anonymise/ddr.py
+++ b/crate_anon/anonymise/ddr.py
@@ -42,6 +42,7 @@
ensure_valid_field_name,
ensure_valid_table_name,
is_sqltype_valid,
+ SQLTYPE_DATE,
)
from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
from cardinal_pythonlib.sqlalchemy.schema import (
@@ -78,7 +79,6 @@
is_sql_column_type_textual,
matches_fielddef,
matches_tabledef,
- SQLTYPE_DATE,
)
if TYPE_CHECKING:
diff --git a/crate_anon/common/sql.py b/crate_anon/common/sql.py
index bf1a1897..cba047b1 100644
--- a/crate_anon/common/sql.py
+++ b/crate_anon/common/sql.py
@@ -55,6 +55,13 @@
make_grammar,
mysql_grammar,
)
+from cardinal_pythonlib.sql.validation import (
+ SQLTYPES_INTEGER,
+ SQLTYPES_BIT,
+ SQLTYPES_FLOAT,
+ SQLTYPES_TEXT,
+ SQLTYPES_OTHER_NUMERIC,
+)
from cardinal_pythonlib.sqlalchemy.core_query import count_star
from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
from cardinal_pythonlib.sqlalchemy.schema import (
@@ -88,48 +95,17 @@
# Constants
# =============================================================================
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Generic
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
TIMING_COMMIT = "commit"
SQL_OPS_VALUE_UNNECESSARY = ["IS NULL", "IS NOT NULL"]
SQL_OPS_MULTIPLE_VALUES = ["IN", "NOT IN"]
-SQLTYPE_DATE = "DATE"
-
-SQLTYPES_INTEGER = [
- "INT",
- "INTEGER",
- "TINYINT",
- "SMALLINT",
- "MEDIUMINT",
- "BIGINT",
- "BIT",
- "BOOL",
- "BOOLEAN",
-]
-SQLTYPES_FLOAT = [
- "DOUBLE",
- "FLOAT",
- "DEC",
- "DECIMAL",
-]
-SQLTYPES_TEXT = [
- "CHAR",
- "VARCHAR",
- "NVARCHAR",
- "TINYTEXT",
- "TEXT",
- "NTEXT",
- "MEDIUMTEXT",
- "LONGTEXT",
-]
-SQLTYPES_WITH_DATE = [
- SQLTYPE_DATE,
- "DATETIME",
- "TIMESTAMP",
-]
-# SQLTYPES_BINARY = [
-# "BINARY", "BLOB", "IMAGE", "LONGBLOB", "VARBINARY",
-# ]
+SQLTYPES_INTEGER_OR_BIT = SQLTYPES_INTEGER + SQLTYPES_BIT
+SQLTYPES_FLOAT_OR_OTHER_NUMERIC = SQLTYPES_FLOAT + SQLTYPES_OTHER_NUMERIC
# Must match querybuilder.js:
QB_DATATYPE_INTEGER = "int"
@@ -147,15 +123,15 @@
# Dictionaries for the different dialects mapping text column type to length
# or default length.
# Doesn't include things like VARCHAR which require the user to specify length
-MYSQL_COLTYPE_TO_LEN = {
- # https://dev.mysql.com/doc/refman/8.0/en/string-type-overview.html
- "CHAR": 1, # can specify CHAR(0) to CHAR(255), but if omitted, length is 1
- "TINYTEXT": 255, # 2^8 - 1
- "TEXT": 65535, # 2^16 - 1
- "MEDIUMTEXT": 16777215, # 2^24 - 1
- "LONGTEXT": 4294967295, # 2^32 - 1
-}
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# SQLAlchemy dialects
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+DATABRICKS_COLTYPE_TO_LEN = {
+ # https://docs.databricks.com/en/sql/language-manual/data-types/string-type.html # noqa: E501
+ "STRING": None # There is no maximum.
+}
MSSQL_COLTYPE_TO_LEN = {
# The "N" prefix means Unicode.
# https://docs.microsoft.com/en-us/sql/t-sql/data-types/char-and-varchar-transact-sql?view=sql-server-ver15 # noqa: E501
@@ -168,16 +144,20 @@
"TEXT": 2**31 - 1,
"NTEXT": 2**30 - 1,
}
+MYSQL_COLTYPE_TO_LEN = {
+ # https://dev.mysql.com/doc/refman/8.0/en/string-type-overview.html
+ "CHAR": 1, # can specify CHAR(0) to CHAR(255), but if omitted, length is 1
+ "TINYTEXT": 255, # 2^8 - 1
+ "TEXT": 65535, # 2^16 - 1
+ "MEDIUMTEXT": 16777215, # 2^24 - 1
+ "LONGTEXT": 4294967295, # 2^32 - 1
+}
-
-# def combine_db_schema_table(db: Optional[str],
-# schema: Optional[str],
-# table: str) -> str:
-# # ANSI SQL: http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
-# #
,
-# if not table:
-# raise ValueError("Missing table supplied to combine_db_schema_table")
-# return ".".join(x for x in [db, schema, table] if x)
+DIALECT_TO_STRING_LEN_LOOKUP = {
+ SqlaDialectName.DATABRICKS: DATABRICKS_COLTYPE_TO_LEN,
+ SqlaDialectName.MSSQL: MSSQL_COLTYPE_TO_LEN,
+ SqlaDialectName.MYSQL: MYSQL_COLTYPE_TO_LEN,
+}
# =============================================================================
@@ -2314,7 +2294,7 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]:
Args:
column_type: SQL column type as a string, e.g. ``"VARCHAR(50)"``
- dialect: the sql dialect the column type is from
+ dialect: the SQL dialect the column type is from
Returns:
length of the column or ``None`` if it's not a text column.
@@ -2324,21 +2304,20 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]:
if column_type in SQLTYPES_TEXT:
# No length specified - get the default
try:
- if dialect == SqlaDialectName.MYSQL:
- return MYSQL_COLTYPE_TO_LEN[column_type]
- elif dialect == SqlaDialectName.MSSQL:
- return MSSQL_COLTYPE_TO_LEN[column_type]
- else:
- raise ValueError(
- f"{dialect} is not a valid SQL dialect. Must "
- f"be one of: {SqlaDialectName.MYSQL!r}, "
- f"{SqlaDialectName.MSSQL!r}"
- )
+ lookup = DIALECT_TO_STRING_LEN_LOOKUP[dialect]
except KeyError:
- log.error(
- f"SQL dialect {dialect} has no data type " f"{column_type}"
+ possible = list(DIALECT_TO_STRING_LEN_LOOKUP.keys())
+ raise ValueError(
+ f"CRATE doesn't properly understand SQL dialect {dialect!r}. "
+ f"Supported: {possible}"
+ )
+ try:
+ return lookup[column_type]
+ except KeyError:
+ raise ValueError(
+ f"For SQL dialect {dialect!r}, CRATE doesn't know the length "
+ f"for string data type {column_type!r}"
)
- raise
else:
# Length specified - get it from the column type
try:
@@ -2346,21 +2325,20 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]:
basetype = m.group(1)
length = m.group(2)
if length == "MAX" or length == "-1":
- if basetype == "VARCHAR":
- return MSSQL_COLTYPE_TO_LEN["VARCHAR_MAX"]
- elif basetype == "NVARCHAR":
- return MSSQL_COLTYPE_TO_LEN["NVARCHAR_MAX"]
- else:
- return None
+ if dialect == SqlaDialectName.MSSQL:
+ if basetype == "VARCHAR":
+ return MSSQL_COLTYPE_TO_LEN["VARCHAR_MAX"]
+ elif basetype == "NVARCHAR":
+ return MSSQL_COLTYPE_TO_LEN["NVARCHAR_MAX"]
+ return None
except AttributeError:
# Not the correct type of column
return None
try:
- length = int(length)
+ return int(length)
except ValueError:
# Not the correct type of column
return None
- return length
def escape_quote_in_literal(s: str) -> str:
diff --git a/crate_anon/crateweb/research/research_db_info.py b/crate_anon/crateweb/research/research_db_info.py
index c8ad275b..82e4ec00 100644
--- a/crate_anon/crateweb/research/research_db_info.py
+++ b/crate_anon/crateweb/research/research_db_info.py
@@ -41,6 +41,10 @@
from cardinal_pythonlib.logs import BraceStyleAdapter
from cardinal_pythonlib.reprfunc import auto_repr
from cardinal_pythonlib.sql.sql_grammar import SqlGrammar
+from cardinal_pythonlib.sql.validation import (
+ SQLTYPES_TEXT,
+ SQLTYPES_WITH_DATE,
+)
from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
from cardinal_pythonlib.sqlalchemy.schema import (
MSSQL_DEFAULT_SCHEMA,
@@ -56,21 +60,19 @@
from crate_anon.common.constants import RUNNING_WITHOUT_CONFIG
from crate_anon.common.sql import (
ColumnId,
- is_sql_column_type_textual,
- make_grammar,
QB_DATATYPE_DATE,
QB_DATATYPE_FLOAT,
QB_DATATYPE_INTEGER,
QB_DATATYPE_STRING,
QB_DATATYPE_STRING_FULLTEXT,
QB_DATATYPE_UNKNOWN,
+ SQLTYPES_FLOAT_OR_OTHER_NUMERIC,
+ SQLTYPES_INTEGER_OR_BIT,
SchemaId,
SqlArgsTupleType,
- SQLTYPES_FLOAT,
- SQLTYPES_WITH_DATE,
- SQLTYPES_TEXT,
- SQLTYPES_INTEGER,
TableId,
+ is_sql_column_type_textual,
+ make_grammar,
translate_sql_qmark_to_percent,
)
from crate_anon.crateweb.core.constants import SettingsKeys
@@ -151,7 +153,7 @@ def querybuilder_type(self) -> str:
defines our field type, like ``"int"`` or ``"date"``. See source.
"""
basetype = self.basetype
- if basetype in SQLTYPES_FLOAT:
+ if basetype in SQLTYPES_FLOAT_OR_OTHER_NUMERIC:
return QB_DATATYPE_FLOAT
if basetype in SQLTYPES_WITH_DATE:
return QB_DATATYPE_DATE
@@ -160,7 +162,7 @@ def querybuilder_type(self) -> str:
return QB_DATATYPE_STRING_FULLTEXT
else:
return QB_DATATYPE_STRING
- if basetype in SQLTYPES_INTEGER:
+ if basetype in SQLTYPES_INTEGER_OR_BIT:
return QB_DATATYPE_INTEGER
return QB_DATATYPE_UNKNOWN
diff --git a/crate_anon/preprocess/systmone_ddgen.py b/crate_anon/preprocess/systmone_ddgen.py
index 16b4dd05..710b6dfc 100644
--- a/crate_anon/preprocess/systmone_ddgen.py
+++ b/crate_anon/preprocess/systmone_ddgen.py
@@ -427,6 +427,7 @@
from cardinal_pythonlib.dicts import reversedict
from cardinal_pythonlib.enumlike import CaseInsensitiveEnumMeta
+from cardinal_pythonlib.sql.validation import SQLTYPE_DATE
from crate_anon.anonymise.altermethod import AlterMethod
from crate_anon.anonymise.constants import (
@@ -437,7 +438,6 @@
SrcFlag,
)
from crate_anon.common.logfunc import warn_once
-from crate_anon.common.sql import SQLTYPE_DATE
from crate_anon.anonymise.dd import DataDictionary, DataDictionaryRow
from crate_anon.preprocess.constants import CRATE_COL_PK
diff --git a/devnotes/2025_01_sqlalchemy2_databricks/notes_databricks.txt b/devnotes/2025_01_sqlalchemy2_databricks/notes_databricks.txt
new file mode 100644
index 00000000..f570b6e9
--- /dev/null
+++ b/devnotes/2025_01_sqlalchemy2_databricks/notes_databricks.txt
@@ -0,0 +1,40 @@
+===============================================================================
+CHANGES FOR DATABRICKS SUPPORT
+===============================================================================
+
+Databricks is a cloud-based data platform:
+https://en.wikipedia.org/wiki/Databricks.
+
+
+-------------------------------------------------------------------------------
+Can you run Databricks locally?
+-------------------------------------------------------------------------------
+
+Maybe not. These instructions for building a Docker image appear ultimately to
+be about creating an image that you can push to an existing Databricks cluster
+and run there:
+
+https://medium.com/@d.v.rademaker/do-it-yourself-building-your-own-databricks-docker-container-9cd670612927
+
+
+-------------------------------------------------------------------------------
+Databricks for SQLAlchemy
+-------------------------------------------------------------------------------
+
+Tutorial for Azure Databricks:
+
+ https://learn.microsoft.com/en-us/azure/databricks/dev-tools/sqlalchemy
+
+(Note: that tutorial has a broken link to its example sqlalchemy.py.)
+
+The basic SQL connector for Databricks is:
+
+ https://github.com/databricks/databricks-sql-python
+
+The SQLAlchemy dialect is:
+
+ https://github.com/databricks/databricks-sqlalchemy
+
+To import it:
+
+ from databricks.sqlalchemy import DatabricksDialect
diff --git a/devnotes/2025_01_sqlalchemy2.txt b/devnotes/2025_01_sqlalchemy2_databricks/notes_sqlalchemy2.txt
similarity index 99%
rename from devnotes/2025_01_sqlalchemy2.txt
rename to devnotes/2025_01_sqlalchemy2_databricks/notes_sqlalchemy2.txt
index b0f2c842..009bee31 100644
--- a/devnotes/2025_01_sqlalchemy2.txt
+++ b/devnotes/2025_01_sqlalchemy2_databricks/notes_sqlalchemy2.txt
@@ -133,6 +133,7 @@ Migration to 2.0 Step Three - Resolve all RemovedIn20Warnings
- pytest
... remember pytest -k; pytest --log-cli-level
- crate_make_demo_database
+ - crate_anon_draft_dd
- crate_anonymise --full
- crate_anonymise --incremental
- crate_nlp --nlpdef crate_biomarkers --full
diff --git a/devnotes/2025_01_sqlalchemy2_databricks/pipeline_test.sh b/devnotes/2025_01_sqlalchemy2_databricks/pipeline_test.sh
new file mode 100755
index 00000000..fcca045b
--- /dev/null
+++ b/devnotes/2025_01_sqlalchemy2_databricks/pipeline_test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -ex
+
+if [ -z "$TMP_CRATE_DEMO_DATABASE_URL" ]; then
+ echo "Please set environment variable TMP_CRATE_DEMO_DATABASE_URL first."
+ exit 1
+fi
+
+crate_make_demo_database "${TMP_CRATE_DEMO_DATABASE_URL}"
+crate_anon_draft_dd
+crate_anonymise --full
+crate_anonymise --incremental
+crate_nlp --nlpdef crate_biomarkers --full
+crate_nlp --nlpdef crate_biomarkers --incremental
+crate_researcher_report ~/Downloads/tmp_crate_researcher_report.pdf
+
+pytest # Do this last: warnings (which may be OK) cause exit code failure.
diff --git a/setup.py b/setup.py
index 257119fb..21e4c4b4 100755
--- a/setup.py
+++ b/setup.py
@@ -69,7 +69,8 @@
"appdirs==1.4.4", # where to store some temporary data
"arrow==0.15.7", # [pin exact version from cardinal_pythonlib]
"beautifulsoup4==4.9.1", # [pin exact version from cardinal_pythonlib]
- "cardinal_pythonlib==2.0.0", # RNC libraries
+ # "cardinal_pythonlib==2.0.0", # RNC libraries
+ "cardinal_pythonlib @ git+https://github.com/RudolfCardinal/pythonlib@sqlalchemy2#egg=cardinal_pythonlib-2.0.0-rc1", # RNC libraries, development version # noqa: E501
"cairosvg==2.7.0", # work with SVG files
"celery==5.2.7", # back-end scheduling
"chardet==3.0.4", # character encoding detection for cardinal_pythonlib