From 8952356beedc240cee103484e52c3617b247ab27 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Wed, 20 Nov 2024 15:39:52 -0800 Subject: [PATCH 1/4] update tests and fixtures to replicate error --- .../workflow_automation/workflow_process.py | 4 +++ tests/fixtures/nmdc_db/data_object_set.json | 13 +++++++++ .../nmdc_db/legacy_data_generation.json | 27 +++++++++++++++++++ .../nmdc_db/metagenome_sequencing.json | 19 +++++++++++++ tests/test_workflow_process.py | 17 ++++++++++++ 5 files changed, 80 insertions(+) create mode 100644 tests/fixtures/nmdc_db/legacy_data_generation.json create mode 100644 tests/fixtures/nmdc_db/metagenome_sequencing.json diff --git a/nmdc_automation/workflow_automation/workflow_process.py b/nmdc_automation/workflow_automation/workflow_process.py index 9e676a83..6a68c237 100644 --- a/nmdc_automation/workflow_automation/workflow_process.py +++ b/nmdc_automation/workflow_automation/workflow_process.py @@ -100,7 +100,9 @@ def get_current_workflow_process_nodes( data_generation_ids = set() data_generation_workflows = [wf for wf in workflows if wf.collection == "data_generation_set"] + workflow_execution_workflows = [wf for wf in workflows if wf.collection == "workflow_execution_set"] + # screen out legacy metagenome sequencing workflows # default query for data_generation_set records filtered by analyte category q = {"analyte_category": analyte_category} @@ -129,6 +131,8 @@ def get_current_workflow_process_nodes( records = db[wf.collection].find(q) for rec in records: + # if rec.get("type") == "nmdc:MetagenomeSequencing" or rec["name"].startswith("Metagenome Sequencing"): + # continue if wf.version and not _within_range(rec["version"], wf.version): continue if _is_missing_required_input_output(wf, rec, data_objects_by_id): diff --git a/tests/fixtures/nmdc_db/data_object_set.json b/tests/fixtures/nmdc_db/data_object_set.json index 0918ae38..e37b410c 100644 --- a/tests/fixtures/nmdc_db/data_object_set.json +++ b/tests/fixtures/nmdc_db/data_object_set.json @@ -825,5 +825,18 @@ "alternative_identifiers" : [ "emsl:output_456424" ] +}, + { + "id" : "nmdc:dobj-11-hnw52332", + "name" : "52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz", + "description" : "Metagenome Raw Reads for nmdc:omprc-11-cegmwy02", + "alternative_identifiers" : [ + + ], + "file_size_bytes" : 31068664547, + "md5_checksum" : "12f380b91ff3364cd3d228505d3402b5", + "data_object_type" : "Metagenome Raw Reads", + "url" : "https://data.microbiomedata.org/data/nmdc:omprc-11-cegmwy02/nmdc:wfmsa-11-jc5cmf37.1/52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz", + "type" : "nmdc:DataObject" } ] \ No newline at end of file diff --git a/tests/fixtures/nmdc_db/legacy_data_generation.json b/tests/fixtures/nmdc_db/legacy_data_generation.json new file mode 100644 index 00000000..164547ad --- /dev/null +++ b/tests/fixtures/nmdc_db/legacy_data_generation.json @@ -0,0 +1,27 @@ +[ + { + "id" : "nmdc:omprc-11-cegmwy02", + "name" : "Terrestrial soil microbial communities - BONA_004-O-20210707-COMP", + "has_input" : [ + "nmdc:procsm-11-d8hkca85" + ], + "gold_sequencing_project_identifiers" : [ + "gold:Gp0704890" + ], + "processing_institution" : "JGI", + "type" : "nmdc:NucleotideSequencing", + "insdc_bioproject_identifiers" : [ + "bioproject:PRJNA1029072" + ], + "analyte_category" : "metagenome", + "associated_studies" : [ + "nmdc:sty-11-34xj1150" + ], + "instrument_used" : [ + "nmdc:inst-14-mr4r2w09" + ], + "has_output" : [ + "nmdc:dobj-11-hnw52332" + ] +} +] \ No newline at end of file diff --git a/tests/fixtures/nmdc_db/metagenome_sequencing.json b/tests/fixtures/nmdc_db/metagenome_sequencing.json new file mode 100644 index 00000000..b695d46e --- /dev/null +++ b/tests/fixtures/nmdc_db/metagenome_sequencing.json @@ -0,0 +1,19 @@ +[ + { + "id" : "nmdc:wfmsa-11-jc5cmf37.1", + "name" : "Metagenome Sequencing Activity for nmdc:wfmsa-11-jc5cmf37.1", + "started_at_time" : "2023-09-13T19:57:49.595727+00:00", + "ended_at_time" : "2023-09-13T19:57:49.595743+00:00", + "was_informed_by" : "nmdc:omprc-11-cegmwy02", + "execution_resource" : "JGI", + "git_url" : "https://github.com/microbiomedata/RawSequencingData", + "has_input" : [ + "nmdc:procsm-11-d8hkca85" + ], + "has_output" : [ + "nmdc:dobj-11-hnw52332" + ], + "type" : "nmdc:MetagenomeSequencing", + "version" : "v1.0.0" +} +] \ No newline at end of file diff --git a/tests/test_workflow_process.py b/tests/test_workflow_process.py index 1cf81058..f4c70852 100644 --- a/tests/test_workflow_process.py +++ b/tests/test_workflow_process.py @@ -45,6 +45,23 @@ def test_load_workflow_process_nodes(test_db, workflow_file, workflows_config_di assert data_gen.children[0].type == "nmdc:ReadQcAnalysis" +def test_load_workflow_process_nodes_does_not_load_metagenome_sequencing(test_db, workflows_config_dir): + """ + Test that legacy nmdc:MetagenomeSequencing instances are not loaded + """ + reset_db(test_db) + load_fixture(test_db, "data_object_set.json") + load_fixture(test_db, "legacy_data_generation.json", "data_generation_set") + load_fixture(test_db, "metagenome_sequencing.json", "workflow_execution_set") + + wfs = load_workflow_configs(workflows_config_dir / "workflows.yaml") + data_objs_by_id = get_required_data_objects_map(test_db, wfs) + wf_execs = get_current_workflow_process_nodes(test_db, wfs, data_objs_by_id, allowlist=["nmdc:omprc-11-cegmwy02",]) + # there should be no metagenome sequencing instances + assert not wf_execs + + + @mark.parametrize( "workflow_file", ["workflows.yaml", "workflows-mt.yaml"] ) From 9a38244dc6c2ab795c518b532bd962ab0af085c0 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Wed, 20 Nov 2024 15:49:44 -0800 Subject: [PATCH 2/4] update fixtures and tests --- tests/fixtures/nmdc_db/data_object_set.json | 13 ------------- tests/fixtures/nmdc_db/legacy_data_obj.json | 15 +++++++++++++++ tests/test_workflow_process.py | 15 ++++++++++----- 3 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 tests/fixtures/nmdc_db/legacy_data_obj.json diff --git a/tests/fixtures/nmdc_db/data_object_set.json b/tests/fixtures/nmdc_db/data_object_set.json index e37b410c..0918ae38 100644 --- a/tests/fixtures/nmdc_db/data_object_set.json +++ b/tests/fixtures/nmdc_db/data_object_set.json @@ -825,18 +825,5 @@ "alternative_identifiers" : [ "emsl:output_456424" ] -}, - { - "id" : "nmdc:dobj-11-hnw52332", - "name" : "52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz", - "description" : "Metagenome Raw Reads for nmdc:omprc-11-cegmwy02", - "alternative_identifiers" : [ - - ], - "file_size_bytes" : 31068664547, - "md5_checksum" : "12f380b91ff3364cd3d228505d3402b5", - "data_object_type" : "Metagenome Raw Reads", - "url" : "https://data.microbiomedata.org/data/nmdc:omprc-11-cegmwy02/nmdc:wfmsa-11-jc5cmf37.1/52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz", - "type" : "nmdc:DataObject" } ] \ No newline at end of file diff --git a/tests/fixtures/nmdc_db/legacy_data_obj.json b/tests/fixtures/nmdc_db/legacy_data_obj.json new file mode 100644 index 00000000..579a35f6 --- /dev/null +++ b/tests/fixtures/nmdc_db/legacy_data_obj.json @@ -0,0 +1,15 @@ +[ + { + "id" : "nmdc:dobj-11-hnw52332", + "name" : "52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz", + "description" : "Metagenome Raw Reads for nmdc:omprc-11-cegmwy02", + "alternative_identifiers" : [ + + ], + "file_size_bytes" : 31068664547, + "md5_checksum" : "12f380b91ff3364cd3d228505d3402b5", + "data_object_type" : "Metagenome Raw Reads", + "url" : "https://data.microbiomedata.org/data/nmdc:omprc-11-cegmwy02/nmdc:wfmsa-11-jc5cmf37.1/52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz", + "type" : "nmdc:DataObject" +} +] \ No newline at end of file diff --git a/tests/test_workflow_process.py b/tests/test_workflow_process.py index f4c70852..690dcbf3 100644 --- a/tests/test_workflow_process.py +++ b/tests/test_workflow_process.py @@ -49,17 +49,22 @@ def test_load_workflow_process_nodes_does_not_load_metagenome_sequencing(test_db """ Test that legacy nmdc:MetagenomeSequencing instances are not loaded """ + exp_omprc = "nmdc:omprc-11-cegmwy02" reset_db(test_db) - load_fixture(test_db, "data_object_set.json") + load_fixture(test_db, "legacy_data_obj.json", "data_object_set") load_fixture(test_db, "legacy_data_generation.json", "data_generation_set") load_fixture(test_db, "metagenome_sequencing.json", "workflow_execution_set") wfs = load_workflow_configs(workflows_config_dir / "workflows.yaml") data_objs_by_id = get_required_data_objects_map(test_db, wfs) - wf_execs = get_current_workflow_process_nodes(test_db, wfs, data_objs_by_id, allowlist=["nmdc:omprc-11-cegmwy02",]) - # there should be no metagenome sequencing instances - assert not wf_execs - + wf_execs = get_current_workflow_process_nodes(test_db, wfs, data_objs_by_id, allowlist=[exp_omprc,]) + # We only expect the data generation to be loaded + assert wf_execs + assert len(wf_execs) == 1 + wf = wf_execs[0] + assert wf.type == "nmdc:NucleotideSequencing" + assert wf.id == exp_omprc + assert wf.was_informed_by == exp_omprc @mark.parametrize( From e4625a71953b3958eaf662c2067c4169a2adfaac Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Wed, 20 Nov 2024 15:50:00 -0800 Subject: [PATCH 3/4] update workflow process --- nmdc_automation/workflow_automation/workflow_process.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nmdc_automation/workflow_automation/workflow_process.py b/nmdc_automation/workflow_automation/workflow_process.py index 6a68c237..330ef012 100644 --- a/nmdc_automation/workflow_automation/workflow_process.py +++ b/nmdc_automation/workflow_automation/workflow_process.py @@ -131,8 +131,9 @@ def get_current_workflow_process_nodes( records = db[wf.collection].find(q) for rec in records: - # if rec.get("type") == "nmdc:MetagenomeSequencing" or rec["name"].startswith("Metagenome Sequencing"): - # continue + # legacy JGI sequencing records + if rec.get("type") == "nmdc:MetagenomeSequencing" or rec["name"].startswith("Metagenome Sequencing"): + continue if wf.version and not _within_range(rec["version"], wf.version): continue if _is_missing_required_input_output(wf, rec, data_objects_by_id): From 4b32131c3e2505c2afe1ef0eba0b088d2384b1f8 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Wed, 20 Nov 2024 15:52:22 -0800 Subject: [PATCH 4/4] remove comment --- nmdc_automation/workflow_automation/workflow_process.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nmdc_automation/workflow_automation/workflow_process.py b/nmdc_automation/workflow_automation/workflow_process.py index 330ef012..f716526f 100644 --- a/nmdc_automation/workflow_automation/workflow_process.py +++ b/nmdc_automation/workflow_automation/workflow_process.py @@ -102,7 +102,6 @@ def get_current_workflow_process_nodes( data_generation_workflows = [wf for wf in workflows if wf.collection == "data_generation_set"] workflow_execution_workflows = [wf for wf in workflows if wf.collection == "workflow_execution_set"] - # screen out legacy metagenome sequencing workflows # default query for data_generation_set records filtered by analyte category q = {"analyte_category": analyte_category}