Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move to SQLAlchemy 2 #187

Draft
wants to merge 40 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
ac42cc9
Revert "Undo commits accidentally pushed to master"
martinburchell Nov 22, 2024
2736027
Interim version number update for Docker
martinburchell Nov 22, 2024
8019dd9
Update minimum Python version
martinburchell Nov 22, 2024
36b1c09
Update version numbers in docs
martinburchell Nov 22, 2024
2918a15
Work on handling multi-table NLP cloud results (and some noqa specifi…
RudolfCardinal Dec 17, 2024
da28b68
Adding support for cloud NLP tabular_schema
RudolfCardinal Dec 18, 2024
5e0aaae
Revert overzealous PyCharm refactor-renaming of engine to dummy_engine
RudolfCardinal Dec 18, 2024
9d41e15
tweaks
RudolfCardinal Dec 18, 2024
165a613
Fix some unit tests
RudolfCardinal Dec 18, 2024
e5b784f
Temporary version number update
martinburchell Dec 19, 2024
67b8ffa
Merge branch 'master' into cpft-dev
martinburchell Dec 19, 2024
8bad34d
Use dictionary format by default for NLPRP results, and test that too
RudolfCardinal Dec 19, 2024
77c5ac3
Merge branch 'master' into cpft-dev
martinburchell Dec 19, 2024
bc09c8f
Use dictionary format by default for NLPRP results, and test that too
RudolfCardinal Dec 19, 2024
0d48ac0
Use dictionary format by default for NLPRP results, and test that too
RudolfCardinal Dec 19, 2024
6939d57
merge in fix_nlprp_client_tabular_schema
RudolfCardinal Dec 19, 2024
1a59900
bump to rc3
RudolfCardinal Dec 19, 2024
e0d701f
working on SQLA2
RudolfCardinal Dec 19, 2024
a2dce99
Improve error reporting from remote NLPRP problems
RudolfCardinal Dec 19, 2024
edc30b3
Merge branch 'fix_nlprp_client_tabular_schema' into sqlalchemy2
RudolfCardinal Dec 19, 2024
c02f6bd
SQLA2 and noqa work
RudolfCardinal Jan 6, 2025
908d82b
SQLA2 and noqa work
RudolfCardinal Jan 6, 2025
3c179a5
Fix a PDF-reading change in ResearcherReportTests
RudolfCardinal Jan 7, 2025
2ebf42a
SQLAlchemy 2.0.36 operational
RudolfCardinal Jan 8, 2025
c10aaa9
Databricks support
RudolfCardinal Jan 8, 2025
ab1668e
try to fix wkhtmltopdf installer by using gdebi not dpkg -i
RudolfCardinal Jan 8, 2025
cf7f1f4
merge in master, fix conflicts
RudolfCardinal Jan 8, 2025
2f459c7
Try to make research_database_info less singleton-like; fix test suit…
RudolfCardinal Jan 9, 2025
6370f07
fix autodocs
RudolfCardinal Jan 9, 2025
f1bb096
bugfix
RudolfCardinal Jan 9, 2025
2f624fb
alter Django test cases to allow database attempts
RudolfCardinal Jan 9, 2025
3fc7139
alter Django test cases to allow database attempts
RudolfCardinal Jan 9, 2025
bac3b20
Merge branch 'master' into sqlalchemy2
martinburchell Jan 10, 2025
08bf0cf
noqa fixes
martinburchell Jan 10, 2025
b734d5d
Fix raw SQL execution in test
martinburchell Jan 13, 2025
1748027
Update table of third party dependencies
martinburchell Jan 13, 2025
8b1cc50
Add explicit configuration for Read the Docs
martinburchell Jan 13, 2025
7502e60
delete superfluous upsert_test_1.sql
RudolfCardinal Jan 14, 2025
25475cd
Shift setup.py to cardinal_pythonlib==2.0.0
RudolfCardinal Jan 14, 2025
a8c1cda
Remove commented-out pdb.set_trace()
RudolfCardinal Jan 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,7 @@ repos:
rev: v1.5.0
hooks:
- id: yesqa
- repo: https://github.com/pre-commit/pygrep-hooks
rev: v1.9.0
hooks:
- id: python-check-blanket-noqa
5 changes: 3 additions & 2 deletions crate_anon/ancillary/timely_project/timely_filter_cpft_rio.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def __init__(self) -> None:
self.exclude_tables,
stage=None,
regex_strings=[
"cpft_core_assessment_v2_kcsa_children_in_household", # about people other than the patient # noqa
"cpft_core_assessment_v2_kcsa_children_in_household",
# ... about people other than the patient
],
)

Expand Down Expand Up @@ -183,7 +184,7 @@ def __init__(self) -> None:
"CPFT_Core_Assessment.*",
"Progress_Note",
"RskRelatedIncidents",
"UserAssessCAMH", # CAMH-specific assessments (e.g. questionnaires) -- can have free-text comments. # noqa
"UserAssessCAMH", # CAMH-specific assessments (e.g. questionnaires) -- can have free-text comments. # noqa: E501
],
)

Expand Down
6 changes: 3 additions & 3 deletions crate_anon/anonymise/anonregex.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ def get_generic_date_regex_elements(
Word boundaries are strongly preferred! This will match some odd things
otherwise; see the associated unit tests.
"""
# https://stackoverflow.com/questions/51224/regular-expression-to-match-valid-dates # noqa
# https://stackoverflow.com/questions/51224/regular-expression-to-match-valid-dates # noqa: E501

# range [1, 31]
numeric_day = named_capture_group(
Expand Down Expand Up @@ -447,7 +447,7 @@ def get_code_regex_elements(
else:
if at_numeric_boundaries_only:
# http://www.regular-expressions.info/lookaround.html
# https://stackoverflow.com/questions/15099150/regex-find-one-digit-number # noqa
# https://stackoverflow.com/questions/15099150/regex-find-one-digit-number # noqa: E501
return [NOT_DIGIT_LOOKBEHIND + s + NOT_DIGIT_LOOKAHEAD]
else:
return [s]
Expand Down Expand Up @@ -527,7 +527,7 @@ def get_uk_postcode_regex_elements(
See:

- https://stackoverflow.com/questions/164979/regex-for-matching-uk-postcodes
""" # noqa
""" # noqa: E501
# -------------------------------------------------------------------------
# Old
# -------------------------------------------------------------------------
Expand Down
39 changes: 20 additions & 19 deletions crate_anon/anonymise/anonymise.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@

from cardinal_pythonlib.datetimefunc import get_now_utc_pendulum
from cardinal_pythonlib.sqlalchemy.core_query import count_star, exists_plain
from cardinal_pythonlib.sqlalchemy.insert_on_duplicate import (
insert_with_upsert_if_supported,
)
from cardinal_pythonlib.sqlalchemy.schema import (
add_index,
get_column_names,
Expand Down Expand Up @@ -277,18 +280,14 @@ def insert(records_: List[Dict[str, Any]]) -> None:
commit_destdb()

# 4. Index -- no, hang on, it's a primary key already
#
# log.debug("... creating index on temporary table")
# index = Index('_temptable_idx', temptable.columns[pkfield])
# index.create(destengine)

# 5. DELETE FROM desttable
# WHERE destpk NOT IN (SELECT srcpk FROM temptable)
log.debug("... deleting from destination where appropriate")
query = dest_table.delete().where(
~column(pkddr.dest_field).in_(select([temptable.columns[pkfield]]))
~column(pkddr.dest_field).in_(select(temptable.columns[pkfield]))
)
destengine.execute(query)
destsession.execute(query)
commit_destdb()

# 6. Drop temporary table
Expand Down Expand Up @@ -392,7 +391,7 @@ def get_valid_pid_subset(given_pids: List[str]) -> List[str]:
pidcol = column(ddr.src_field)
session = config.sources[ddr.src_db].session
query = (
select([pidcol])
select(pidcol)
.select_from(table(ddr.src_table))
.where(pidcol is not None)
.distinct()
Expand Down Expand Up @@ -477,7 +476,7 @@ def get_pid_subset_from_field(
session = config.sources[ddr.src_db].session
# Find pids corresponding to the given values of specified field
query = (
select([pidcol])
select(pidcol)
.select_from(table(ddr.src_table))
.where((fieldcol.in_(values_to_find)) & (pidcol is not None))
.distinct()
Expand All @@ -503,7 +502,7 @@ def get_pid_subset_from_field(
# join_obj = ddr_table.join(chosen_table,
# chosen_table.c.fieldcol == ddr_table.c.pidcol)
# query = (
# select([pidcol]).
# select(pidcol).
# select_from(join_obj).
# where((chosen_table.fieldcol.in_(field_elements)) &
# (ddr_table.pidcol is not None)).
Expand Down Expand Up @@ -644,7 +643,7 @@ def get_pids_from_limits(low: int, high: int) -> List[Any]:
pidcol = column(ddr.src_field)
session = config.sources[ddr.src_db].session
query = (
select([pidcol])
select(pidcol)
.select_from(table(ddr.src_table))
.where((pidcol.between(low, high)) & (pidcol is not None))
.distinct()
Expand Down Expand Up @@ -721,7 +720,7 @@ def get_pids_query_field_limits(field: str, low: int, high: int) -> List[Any]:
session = config.sources[ddr.src_db].session
# Find pids corresponding to the given values of specified field
query = (
select([pidcol])
select(pidcol)
.select_from(table(ddr.src_table))
.where((fieldcol.between(low, high)) & (pidcol is not None))
.distinct()
Expand Down Expand Up @@ -875,7 +874,7 @@ def gen_patient_ids(
session = config.sources[ddr.src_db].session
pidcol = column(ddr.src_field)
query = (
select([pidcol])
select(pidcol)
.select_from(table(ddr.src_table))
.where(pidcol is not None)
.distinct()
Expand Down Expand Up @@ -990,7 +989,7 @@ def gen_rows(
``sourcefields``
"""
t = config.sources[dbname].metadata.tables[sourcetable]
q = select([column(c) for c in sourcefields]).select_from(t)
q = select(*[column(c) for c in sourcefields]).select_from(t)
# not ordered

# Restrict to one patient?
Expand Down Expand Up @@ -1042,7 +1041,7 @@ def count_rows(
"""
# Count function to match gen_rows()
session = config.sources[dbname].session
query = select([func.count()]).select_from(table(sourcetable))
query = select(func.count()).select_from(table(sourcetable))
if pid is not None:
pidcol_name = config.dd.get_pid_name(dbname, sourcetable)
query = query.where(column(pidcol_name) == pid)
Expand Down Expand Up @@ -1138,7 +1137,7 @@ def gen_pks(
"""
db = config.sources[srcdbname]
t = db.metadata.tables[tablename]
q = select([column(pkname)]).select_from(t)
q = select(column(pkname)).select_from(t)
result = db.session.execute(q)
for row in result:
yield row[0]
Expand Down Expand Up @@ -1403,7 +1402,9 @@ def process_table(
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Insert values into database
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
q = sqla_table.insert_on_duplicate().values(destvalues)
q = insert_with_upsert_if_supported(
table=sqla_table, values=destvalues, session=session
)
session.execute(q)

# Trigger an early commit?
Expand Down Expand Up @@ -1635,9 +1636,9 @@ def insert(records_: List[Dict[str, Any]]) -> None:
log.debug(start + f": ... {dest_table_name}")
dest_table = config.dd.get_dest_sqla_table(dest_table_name)
query = dest_table.delete().where(
column(ridfield).in_(select([temptable.columns[pkfield]]))
column(ridfield).in_(select(temptable.columns[pkfield]))
)
destengine.execute(query)
destsession.execute(query)
commit_destdb()

log.debug(start + ": 6. dropping temporary table")
Expand Down Expand Up @@ -1836,7 +1837,7 @@ def gen_opt_out_pids_from_database(
optout_colname, optout_col_values
)

query = select([idcol]).select_from(sqla_table).distinct()
query = select(idcol).select_from(sqla_table).distinct()

if optout_col_values:
# Note that if optout_col_values does not contain valid values,
Expand Down
24 changes: 4 additions & 20 deletions crate_anon/anonymise/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,8 @@
ensure_valid_field_name,
ensure_valid_table_name,
)
from cardinal_pythonlib.sqlalchemy.schema import (
hack_in_mssql_xml_type,
is_sqlatype_integer,
)
from cardinal_pythonlib.sqlalchemy.schema import is_sqlatype_integer
from cardinal_pythonlib.sizeformatter import sizeof_fmt
from cardinal_pythonlib.sqlalchemy.insert_on_duplicate import (
monkeypatch_TableClause,
)
import regex
from sqlalchemy import BigInteger, create_engine, String
from sqlalchemy.dialects.mssql.base import dialect as ms_sql_server_dialect
Expand Down Expand Up @@ -134,11 +128,6 @@

log = logging.getLogger(__name__)

# The Config class is loaded very early, via the nasty singleton.
# This is therefore the appropriate moment to make any changes to SQLAlchemy.
monkeypatch_TableClause()
hack_in_mssql_xml_type() # support XML type under SQL Server


# =============================================================================
# Config/databases
Expand Down Expand Up @@ -456,7 +445,7 @@ def get_word_alternatives(filenames: List[str]) -> List[List[str]]:
Returns:
a list of lists of equivalent words

""" # noqa
""" # noqa: E501
alternatives = [] # type: List[List[str]]
all_words_seen = set() # type: Set[str]
for filename in filenames:
Expand Down Expand Up @@ -987,9 +976,7 @@ def get_database(
self._dest_bytes_written = 0
self._echo = False

def get_destdb_engine_outside_transaction(
self, encoding: str = "utf-8"
) -> Engine:
def get_destdb_engine_outside_transaction(self) -> Engine:
"""
Get a standalone SQLAlchemy Engine for the destination database, and
configure itself so transactions aren't used (equivalently:
Expand All @@ -999,18 +986,15 @@ def get_destdb_engine_outside_transaction(
See
https://github.com/mkleehammer/pyodbc/wiki/Database-Transaction-Management

Args:
encoding: passed to the SQLAlchemy :func:`create_engine` function

Returns:
the Engine
"""
url = self._destination_database_url
return create_engine(
url,
encoding=encoding,
echo=self._echo,
connect_args={"autocommit": True}, # for pyodbc
future=True,
)

def overall_progress(self) -> str:
Expand Down
4 changes: 2 additions & 2 deletions crate_anon/anonymise/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ class SrcFlag(StrEnum):
# MySQL: 64 -- http://dev.mysql.com/doc/refman/5.7/en/identifiers.html
SQLSERVER_MAX_IDENTIFIER_LENGTH = 128
# Microsoft SQL Server: 128 --
# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers # noqa
# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers # noqa: E501


# =============================================================================
Expand Down Expand Up @@ -861,7 +861,7 @@ class HashConfigKeys:
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =

""" # noqa
""" # noqa: E501

# For the style:
# [source_databases]
Expand Down
19 changes: 9 additions & 10 deletions crate_anon/anonymise/dbholder.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,9 @@
import logging
from typing import List, Optional, TYPE_CHECKING

from sqlalchemy import create_engine
from sqlalchemy.engine import create_engine
from sqlalchemy.engine.base import Connection
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.session import Session
from sqlalchemy.orm.session import sessionmaker, Session
from sqlalchemy.sql.schema import MetaData

if TYPE_CHECKING:
Expand All @@ -60,7 +59,6 @@ def __init__(
with_session: bool = False,
with_conn: bool = True,
reflect: bool = True,
encoding: str = "utf-8",
echo: bool = False,
) -> None:
"""
Expand All @@ -71,18 +69,17 @@ def __init__(
with_session: create an SQLAlchemy Session?
with_conn: create an SQLAlchemy connection (via an Engine)?
reflect: read the database structure (when required)?
encoding: passed to SQLAlchemy's :func:`create_engine`
echo: passed to SQLAlchemy's :func:`create_engine`
"""
self.name = name
self.srccfg = srccfg
self.engine = create_engine(url, encoding=encoding, echo=echo)
self.engine = create_engine(url, echo=echo, future=True)
self.conn = None # type: Optional[Connection]
self.session = None # type: Optional[Session]
self._reflect_on_request = reflect
self._reflected = False
self._table_names = [] # type: List[str]
self._metadata = MetaData(bind=self.engine)
self._metadata = MetaData()
log.debug(self.engine) # obscures password

if with_conn: # for raw connections
Expand All @@ -101,7 +98,9 @@ def create_session(self) -> None:
Creates a database session, if not created to begin with.
"""
if not self.session:
self.session = sessionmaker(bind=self.engine)() # for ORM
self.session = sessionmaker(
bind=self.engine, future=True
)() # type: Session

def _reflect(self) -> None:
"""
Expand All @@ -113,15 +112,15 @@ def _reflect(self) -> None:
return
log.info(f"Reflecting database: {self.name}")
# self.table_names = get_table_names(self.engine)
self._metadata.reflect(views=True) # include views
self._metadata.reflect(bind=self.engine, views=True) # include views
self._table_names = [t.name for t in self._metadata.sorted_tables]
self._reflected = True

def update_metadata(self) -> None:
"""
Updates the metadata, for example if a table has been dropped.
"""
self._metadata = MetaData(bind=self.engine)
self._metadata = MetaData()

@property
def metadata(self) -> MetaData:
Expand Down
4 changes: 2 additions & 2 deletions crate_anon/anonymise/dd.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,7 +571,7 @@ def tidy_draft(self) -> None:
CREATE TABLE junk (intthing INT PRIMARY KEY, text1 LONGTEXT, text2 LONGTEXT);
ALTER TABLE junk ADD FULLTEXT INDEX ftidx1 (text1);
ALTER TABLE junk ADD FULLTEXT INDEX ftidx2 (text2); -- OK
""" # noqa
""" # noqa: E501
log.info("Tidying/correcting draft data dictionary")

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -591,7 +591,7 @@ def tidy_draft(self) -> None:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
log.info("... Make full-text indexes follow dialect rules")
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# https://docs.microsoft.com/en-us/sql/t-sql/statements/create-fulltext-index-transact-sql?view=sql-server-ver15 # noqa
# https://docs.microsoft.com/en-us/sql/t-sql/statements/create-fulltext-index-transact-sql?view=sql-server-ver15 # noqa: E501
if self.dest_dialect_name == SqlaDialectName.SQLSERVER:
# -----------------------------------------------------------------
# Checks for Microsoft SQL Server
Expand Down
4 changes: 2 additions & 2 deletions crate_anon/anonymise/ddr.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
ensure_valid_field_name,
ensure_valid_table_name,
is_sqltype_valid,
SQLTYPE_DATE,
)
from cardinal_pythonlib.sqlalchemy.dialect import SqlaDialectName
from cardinal_pythonlib.sqlalchemy.schema import (
Expand Down Expand Up @@ -78,7 +79,6 @@
is_sql_column_type_textual,
matches_fielddef,
matches_tabledef,
SQLTYPE_DATE,
)

if TYPE_CHECKING:
Expand Down Expand Up @@ -1045,7 +1045,7 @@ def dest_sqla_column(self) -> Column:
kwargs = {
"doc": comment, # Python side
"comment": comment, # SQL side; supported from SQLAlchemy 1.2:
# https://docs.sqlalchemy.org/en/14/core/metadata.html#sqlalchemy.schema.Column.params.comment # noqa
# https://docs.sqlalchemy.org/en/14/core/metadata.html#sqlalchemy.schema.Column.params.comment # noqa: E501
}
if self.pk:
kwargs["primary_key"] = True
Expand Down
Loading