Skip to content

Commit

Permalink
Databricks support
Browse files Browse the repository at this point in the history
  • Loading branch information
RudolfCardinal committed Jan 8, 2025
1 parent 2ebf42a commit c10aaa9
Show file tree
Hide file tree
Showing 8 changed files with 127 additions and 87 deletions.
2 changes: 1 addition & 1 deletion crate_anon/anonymise/ddr.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
ensure_valid_field_name,
ensure_valid_table_name,
is_sqltype_valid,
SQLTYPE_DATE,
)
from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
from cardinal_pythonlib.sqlalchemy.schema import (
Expand Down Expand Up @@ -78,7 +79,6 @@
is_sql_column_type_textual,
matches_fielddef,
matches_tabledef,
SQLTYPE_DATE,
)

if TYPE_CHECKING:
Expand Down
130 changes: 54 additions & 76 deletions crate_anon/common/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,13 @@
make_grammar,
mysql_grammar,
)
from cardinal_pythonlib.sql.validation import (
SQLTYPES_INTEGER,
SQLTYPES_BIT,
SQLTYPES_FLOAT,
SQLTYPES_TEXT,
SQLTYPES_OTHER_NUMERIC,
)
from cardinal_pythonlib.sqlalchemy.core_query import count_star
from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
from cardinal_pythonlib.sqlalchemy.schema import (
Expand Down Expand Up @@ -88,48 +95,17 @@
# Constants
# =============================================================================

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Generic
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TIMING_COMMIT = "commit"

SQL_OPS_VALUE_UNNECESSARY = ["IS NULL", "IS NOT NULL"]
SQL_OPS_MULTIPLE_VALUES = ["IN", "NOT IN"]

SQLTYPE_DATE = "DATE"

SQLTYPES_INTEGER = [
"INT",
"INTEGER",
"TINYINT",
"SMALLINT",
"MEDIUMINT",
"BIGINT",
"BIT",
"BOOL",
"BOOLEAN",
]
SQLTYPES_FLOAT = [
"DOUBLE",
"FLOAT",
"DEC",
"DECIMAL",
]
SQLTYPES_TEXT = [
"CHAR",
"VARCHAR",
"NVARCHAR",
"TINYTEXT",
"TEXT",
"NTEXT",
"MEDIUMTEXT",
"LONGTEXT",
]
SQLTYPES_WITH_DATE = [
SQLTYPE_DATE,
"DATETIME",
"TIMESTAMP",
]
# SQLTYPES_BINARY = [
# "BINARY", "BLOB", "IMAGE", "LONGBLOB", "VARBINARY",
# ]
SQLTYPES_INTEGER_OR_BIT = SQLTYPES_INTEGER + SQLTYPES_BIT
SQLTYPES_FLOAT_OR_OTHER_NUMERIC = SQLTYPES_FLOAT + SQLTYPES_OTHER_NUMERIC

# Must match querybuilder.js:
QB_DATATYPE_INTEGER = "int"
Expand All @@ -147,15 +123,15 @@
# Dictionaries for the different dialects mapping text column type to length
# or default length.
# Doesn't include things like VARCHAR which require the user to specify length
MYSQL_COLTYPE_TO_LEN = {
# https://dev.mysql.com/doc/refman/8.0/en/string-type-overview.html
"CHAR": 1, # can specify CHAR(0) to CHAR(255), but if omitted, length is 1
"TINYTEXT": 255, # 2^8 - 1
"TEXT": 65535, # 2^16 - 1
"MEDIUMTEXT": 16777215, # 2^24 - 1
"LONGTEXT": 4294967295, # 2^32 - 1
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# SQLAlchemy dialects
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

DATABRICKS_COLTYPE_TO_LEN = {
# https://docs.databricks.com/en/sql/language-manual/data-types/string-type.html # noqa: E501
"STRING": None # There is no maximum.
}
MSSQL_COLTYPE_TO_LEN = {
# The "N" prefix means Unicode.
# https://docs.microsoft.com/en-us/sql/t-sql/data-types/char-and-varchar-transact-sql?view=sql-server-ver15 # noqa: E501
Expand All @@ -168,16 +144,20 @@
"TEXT": 2**31 - 1,
"NTEXT": 2**30 - 1,
}
MYSQL_COLTYPE_TO_LEN = {
# https://dev.mysql.com/doc/refman/8.0/en/string-type-overview.html
"CHAR": 1, # can specify CHAR(0) to CHAR(255), but if omitted, length is 1
"TINYTEXT": 255, # 2^8 - 1
"TEXT": 65535, # 2^16 - 1
"MEDIUMTEXT": 16777215, # 2^24 - 1
"LONGTEXT": 4294967295, # 2^32 - 1
}


# def combine_db_schema_table(db: Optional[str],
# schema: Optional[str],
# table: str) -> str:
# # ANSI SQL: http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
# # <table name>, <qualified name>
# if not table:
# raise ValueError("Missing table supplied to combine_db_schema_table")
# return ".".join(x for x in [db, schema, table] if x)
DIALECT_TO_STRING_LEN_LOOKUP = {
SqlaDialectName.DATABRICKS: DATABRICKS_COLTYPE_TO_LEN,
SqlaDialectName.MSSQL: MSSQL_COLTYPE_TO_LEN,
SqlaDialectName.MYSQL: MYSQL_COLTYPE_TO_LEN,
}


# =============================================================================
Expand Down Expand Up @@ -2314,7 +2294,7 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]:
Args:
column_type: SQL column type as a string, e.g. ``"VARCHAR(50)"``
dialect: the sql dialect the column type is from
dialect: the SQL dialect the column type is from
Returns:
length of the column or ``None`` if it's not a text column.
Expand All @@ -2324,43 +2304,41 @@ def coltype_length_if_text(column_type: str, dialect: str) -> Optional[int]:
if column_type in SQLTYPES_TEXT:
# No length specified - get the default
try:
if dialect == SqlaDialectName.MYSQL:
return MYSQL_COLTYPE_TO_LEN[column_type]
elif dialect == SqlaDialectName.MSSQL:
return MSSQL_COLTYPE_TO_LEN[column_type]
else:
raise ValueError(
f"{dialect} is not a valid SQL dialect. Must "
f"be one of: {SqlaDialectName.MYSQL!r}, "
f"{SqlaDialectName.MSSQL!r}"
)
lookup = DIALECT_TO_STRING_LEN_LOOKUP[dialect]
except KeyError:
log.error(
f"SQL dialect {dialect} has no data type " f"{column_type}"
possible = list(DIALECT_TO_STRING_LEN_LOOKUP.keys())
raise ValueError(
f"CRATE doesn't properly understand SQL dialect {dialect!r}. "
f"Supported: {possible}"
)
try:
return lookup[column_type]
except KeyError:
raise ValueError(
f"For SQL dialect {dialect!r}, CRATE doesn't know the length "
f"for string data type {column_type!r}"
)
raise
else:
# Length specified - get it from the column type
try:
m = COLTYPE_WITH_ONE_INTEGER_REGEX.match(column_type)
basetype = m.group(1)
length = m.group(2)
if length == "MAX" or length == "-1":
if basetype == "VARCHAR":
return MSSQL_COLTYPE_TO_LEN["VARCHAR_MAX"]
elif basetype == "NVARCHAR":
return MSSQL_COLTYPE_TO_LEN["NVARCHAR_MAX"]
else:
return None
if dialect == SqlaDialectName.MSSQL:
if basetype == "VARCHAR":
return MSSQL_COLTYPE_TO_LEN["VARCHAR_MAX"]
elif basetype == "NVARCHAR":
return MSSQL_COLTYPE_TO_LEN["NVARCHAR_MAX"]
return None
except AttributeError:
# Not the correct type of column
return None
try:
length = int(length)
return int(length)
except ValueError:
# Not the correct type of column
return None
return length


def escape_quote_in_literal(s: str) -> str:
Expand Down
18 changes: 10 additions & 8 deletions crate_anon/crateweb/research/research_db_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
from cardinal_pythonlib.logs import BraceStyleAdapter
from cardinal_pythonlib.reprfunc import auto_repr
from cardinal_pythonlib.sql.sql_grammar import SqlGrammar
from cardinal_pythonlib.sql.validation import (
SQLTYPES_TEXT,
SQLTYPES_WITH_DATE,
)
from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
from cardinal_pythonlib.sqlalchemy.schema import (
MSSQL_DEFAULT_SCHEMA,
Expand All @@ -56,21 +60,19 @@
from crate_anon.common.constants import RUNNING_WITHOUT_CONFIG
from crate_anon.common.sql import (
ColumnId,
is_sql_column_type_textual,
make_grammar,
QB_DATATYPE_DATE,
QB_DATATYPE_FLOAT,
QB_DATATYPE_INTEGER,
QB_DATATYPE_STRING,
QB_DATATYPE_STRING_FULLTEXT,
QB_DATATYPE_UNKNOWN,
SQLTYPES_FLOAT_OR_OTHER_NUMERIC,
SQLTYPES_INTEGER_OR_BIT,
SchemaId,
SqlArgsTupleType,
SQLTYPES_FLOAT,
SQLTYPES_WITH_DATE,
SQLTYPES_TEXT,
SQLTYPES_INTEGER,
TableId,
is_sql_column_type_textual,
make_grammar,
translate_sql_qmark_to_percent,
)
from crate_anon.crateweb.core.constants import SettingsKeys
Expand Down Expand Up @@ -151,7 +153,7 @@ def querybuilder_type(self) -> str:
defines our field type, like ``"int"`` or ``"date"``. See source.
"""
basetype = self.basetype
if basetype in SQLTYPES_FLOAT:
if basetype in SQLTYPES_FLOAT_OR_OTHER_NUMERIC:
return QB_DATATYPE_FLOAT
if basetype in SQLTYPES_WITH_DATE:
return QB_DATATYPE_DATE
Expand All @@ -160,7 +162,7 @@ def querybuilder_type(self) -> str:
return QB_DATATYPE_STRING_FULLTEXT
else:
return QB_DATATYPE_STRING
if basetype in SQLTYPES_INTEGER:
if basetype in SQLTYPES_INTEGER_OR_BIT:
return QB_DATATYPE_INTEGER
return QB_DATATYPE_UNKNOWN

Expand Down
2 changes: 1 addition & 1 deletion crate_anon/preprocess/systmone_ddgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,7 @@

from cardinal_pythonlib.dicts import reversedict
from cardinal_pythonlib.enumlike import CaseInsensitiveEnumMeta
from cardinal_pythonlib.sql.validation import SQLTYPE_DATE

from crate_anon.anonymise.altermethod import AlterMethod
from crate_anon.anonymise.constants import (
Expand All @@ -437,7 +438,6 @@
SrcFlag,
)
from crate_anon.common.logfunc import warn_once
from crate_anon.common.sql import SQLTYPE_DATE
from crate_anon.anonymise.dd import DataDictionary, DataDictionaryRow
from crate_anon.preprocess.constants import CRATE_COL_PK

Expand Down
40 changes: 40 additions & 0 deletions devnotes/2025_01_sqlalchemy2_databricks/notes_databricks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
===============================================================================
CHANGES FOR DATABRICKS SUPPORT
===============================================================================

Databricks is a cloud-based data platform:
https://en.wikipedia.org/wiki/Databricks.


-------------------------------------------------------------------------------
Can you run Databricks locally?
-------------------------------------------------------------------------------

Maybe not. These instructions for building a Docker image appear ultimately to
be about creating an image that you can push to an existing Databricks cluster
and run there:

https://medium.com/@d.v.rademaker/do-it-yourself-building-your-own-databricks-docker-container-9cd670612927


-------------------------------------------------------------------------------
Databricks for SQLAlchemy
-------------------------------------------------------------------------------

Tutorial for Azure Databricks:

https://learn.microsoft.com/en-us/azure/databricks/dev-tools/sqlalchemy

... has a broken link to its example sqlalchemy.py

The basic SQL connector for Databricks is:

https://github.com/databricks/databricks-sql-python

The SQLAlchemy dialect is:

https://github.com/databricks/databricks-sqlalchemy

To import it:

from databricks.sqlalchemy import DatabricksDialect
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ Migration to 2.0 Step Three - Resolve all RemovedIn20Warnings
- pytest
... remember pytest -k; pytest --log-cli-level
- crate_make_demo_database <URL>
- crate_anon_draft_dd
- crate_anonymise --full
- crate_anonymise --incremental
- crate_nlp --nlpdef crate_biomarkers --full
Expand Down
18 changes: 18 additions & 0 deletions devnotes/2025_01_sqlalchemy2_databricks/pipeline_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Manual end-to-end smoke test of the CRATE command-line pipeline.
#
# Runs, in order: demo database creation, draft data dictionary generation,
# full and incremental anonymisation, full and incremental NLP (using the
# "crate_biomarkers" NLP definition), the researcher report, and finally the
# pytest suite.
#
# Required environment:
#   TMP_CRATE_DEMO_DATABASE_URL
#       Database URL passed to crate_make_demo_database.
#
# Output:
#   ~/Downloads/tmp_crate_researcher_report.pdf (written by
#   crate_researcher_report).

# -e: stop at the first failing command; -x: echo each command as it runs.
set -ex

# ${VAR:-} treats an unset variable like an empty one, so this check also
# works if the script is ever run with "set -u".
if [ -z "${TMP_CRATE_DEMO_DATABASE_URL:-}" ]; then
    # Diagnostics go to stderr so they are not lost if stdout is redirected.
    echo "Please set environment variable TMP_CRATE_DEMO_DATABASE_URL first." >&2
    exit 1
fi

crate_make_demo_database "${TMP_CRATE_DEMO_DATABASE_URL}"
crate_anon_draft_dd
crate_anonymise --full
crate_anonymise --incremental
crate_nlp --nlpdef crate_biomarkers --full
crate_nlp --nlpdef crate_biomarkers --incremental
crate_researcher_report ~/Downloads/tmp_crate_researcher_report.pdf

pytest  # Do this last: warnings (which may be OK) cause exit code failure.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@
"appdirs==1.4.4", # where to store some temporary data
"arrow==0.15.7", # [pin exact version from cardinal_pythonlib]
"beautifulsoup4==4.9.1", # [pin exact version from cardinal_pythonlib]
"cardinal_pythonlib==2.0.0", # RNC libraries
# "cardinal_pythonlib==2.0.0", # RNC libraries
"cardinal_pythonlib @ git+https://github.com/RudolfCardinal/pythonlib@sqlalchemy2#egg=cardinal_pythonlib-2.0.0-rc1", # RNC libraries, development version # noqa: E501
"cairosvg==2.7.0", # work with SVG files
"celery==5.2.7", # back-end scheduling
"chardet==3.0.4", # character encoding detection for cardinal_pythonlib
Expand Down

0 comments on commit c10aaa9

Please sign in to comment.