From c97903f730cb781b0ebfe909720894f84a174f70 Mon Sep 17 00:00:00 2001 From: David Reinhart Date: Mon, 27 Nov 2023 15:06:03 -0800 Subject: [PATCH] Include `access_role` value in genomic sequencing ETL An `access_role` column was recently added to the `sample`, `sequence_read_set`, `genomic_sequence`, and `consensus_genome` tables of the `warehouse` schema to allow data with restricted access to be included in these tables. These changes include `access_role` in the appropriate class definitions, and update the ETLs to propagate the `access_role` value found in the `sample` record to the related records. The sample record will continue to serve as the primary source of the `access_role` value, but the value must also be stored in each table that uses row-level security. --- lib/id3c/cli/command/etl/__init__.py | 4 ++-- lib/id3c/cli/command/etl/consensus_genome.py | 23 ++++++++++++-------- lib/id3c/db/types.py | 4 ++++ 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/lib/id3c/cli/command/etl/__init__.py b/lib/id3c/cli/command/etl/__init__.py index d6450a50..391b155f 100644 --- a/lib/id3c/cli/command/etl/__init__.py +++ b/lib/id3c/cli/command/etl/__init__.py @@ -287,7 +287,7 @@ def update_sample(db: DatabaseSession, update warehouse.sample set encounter_id = %s where sample_id = %s - returning sample_id as id, identifier + returning sample_id as id, identifier, access_role """, (encounter_id, sample.id)) assert sample.id, "Updating encounter_id affected no rows!" 
@@ -341,7 +341,7 @@ def find_sample(db: DatabaseSession, identifier: str, for_update = True) -> Any: query_ending = "for update" sample = db.fetch_row(""" - select sample_id as id, identifier, encounter_id + select sample_id as id, identifier, encounter_id, access_role from warehouse.sample where identifier = %s or collection_identifier = %s diff --git a/lib/id3c/cli/command/etl/consensus_genome.py b/lib/id3c/cli/command/etl/consensus_genome.py index 878f24e7..0d3a8604 100644 --- a/lib/id3c/cli/command/etl/consensus_genome.py +++ b/lib/id3c/cli/command/etl/consensus_genome.py @@ -138,7 +138,7 @@ def find_or_create_sequence_read_set(db: DatabaseSession, document: dict, sample """)) sequence_read_set: SequenceReadSetRecord = db.fetch_row(""" - select sequence_read_set_id as id, sample_id, urls + select sequence_read_set_id as id, sample_id, urls, access_role from warehouse.sequence_read_set where sample_id = %s and urls @> %s @@ -147,6 +147,9 @@ def find_or_create_sequence_read_set(db: DatabaseSession, document: dict, sample if sequence_read_set: LOG.info(f"Found sequence read set {sequence_read_set.id}") + if sample.access_role: + assert sample.access_role == sequence_read_set.access_role, \ + f"Access_role for sample id «{sample.id}» does not match sequence read set id «{sequence_read_set.id}» " else: LOG.debug(dedent(f""" Sequence read set not found for sample id «{sample.id}» and urls {urls} @@ -154,13 +157,14 @@ def find_or_create_sequence_read_set(db: DatabaseSession, document: dict, sample data = { "sample_id": sample.id, + "access_role": sample.access_role, "urls": urls, } sequence_read_set = db.fetch_row(""" - insert into warehouse.sequence_read_set (sample_id, urls) - values (%(sample_id)s, %(urls)s) - returning sequence_read_set_id as id, sample_id, urls + insert into warehouse.sequence_read_set (sample_id, urls, access_role) + values (%(sample_id)s, %(urls)s, %(access_role)s) + returning sequence_read_set_id as id, sample_id, urls, access_role """, data) 
LOG.info(f"Created sequence read set {sequence_read_set.id}") @@ -254,19 +258,20 @@ def upsert_genome(db: DatabaseSession, sequence_read_set: SequenceReadSetRecord, "sample_id": sequence_read_set.sample_id, "organism_id": organism.id, "sequence_read_set_id": sequence_read_set.id, - "additional_details": Json(document['summary_stats']) + "additional_details": Json(document['summary_stats']), + "access_role": sequence_read_set.access_role } genome: GenomeRecord = db.fetch_row(""" insert into warehouse.consensus_genome (sample_id, organism_id, - sequence_read_set_id, details) + sequence_read_set_id, details, access_role) values (%(sample_id)s, %(organism_id)s, %(sequence_read_set_id)s, - %(additional_details)s) + %(additional_details)s, %(access_role)s) on conflict (sample_id, organism_id, sequence_read_set_id) do update - set details = %(additional_details)s + set details = %(additional_details)s, access_role = %(access_role)s - returning consensus_genome_id as id, sample_id, organism_id, sequence_read_set_id + returning consensus_genome_id as id, sample_id, organism_id, sequence_read_set_id, access_role """, data) assert genome.id, "Upsert affected no rows!" diff --git a/lib/id3c/db/types.py b/lib/id3c/db/types.py index 054193ab..01a47e33 100644 --- a/lib/id3c/db/types.py +++ b/lib/id3c/db/types.py @@ -14,12 +14,14 @@ class IdentifierRecord(NamedTuple): class MinimalSampleRecord(NamedTuple): id: int identifier: str + access_role: Optional[str] class SampleRecord(NamedTuple): id: int identifier: str encounter_id: Optional[int] type: Optional[str] + access_role: Optional[str] class KitRecord(NamedTuple): id: int @@ -36,12 +38,14 @@ class SequenceReadSetRecord(NamedTuple): id: int sample_id: int urls: Optional[List[str]] + access_role: Optional[str] class GenomeRecord(NamedTuple): id: int sample_id: int organism_id: int sequence_read_set_id: int + access_role: Optional[str] class MinimalLocationRecord(NamedTuple): id: int