Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

293 skip metagenome sequencing #295

Merged
merged 4 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions nmdc_automation/workflow_automation/workflow_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def get_current_workflow_process_nodes(

data_generation_ids = set()
data_generation_workflows = [wf for wf in workflows if wf.collection == "data_generation_set"]

workflow_execution_workflows = [wf for wf in workflows if wf.collection == "workflow_execution_set"]

# default query for data_generation_set records filtered by analyte category
Expand Down Expand Up @@ -129,6 +130,9 @@ def get_current_workflow_process_nodes(

records = db[wf.collection].find(q)
for rec in records:
# skip legacy JGI sequencing records (nmdc:MetagenomeSequencing type or "Metagenome Sequencing..." name)
if rec.get("type") == "nmdc:MetagenomeSequencing" or rec["name"].startswith("Metagenome Sequencing"):
continue
if wf.version and not _within_range(rec["version"], wf.version):
continue
if _is_missing_required_input_output(wf, rec, data_objects_by_id):
Expand Down
27 changes: 27 additions & 0 deletions tests/fixtures/nmdc_db/legacy_data_generation.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[
{
"id" : "nmdc:omprc-11-cegmwy02",
"name" : "Terrestrial soil microbial communities - BONA_004-O-20210707-COMP",
"has_input" : [
"nmdc:procsm-11-d8hkca85"
],
"gold_sequencing_project_identifiers" : [
"gold:Gp0704890"
],
"processing_institution" : "JGI",
"type" : "nmdc:NucleotideSequencing",
"insdc_bioproject_identifiers" : [
"bioproject:PRJNA1029072"
],
"analyte_category" : "metagenome",
"associated_studies" : [
"nmdc:sty-11-34xj1150"
],
"instrument_used" : [
"nmdc:inst-14-mr4r2w09"
],
"has_output" : [
"nmdc:dobj-11-hnw52332"
]
}
]
15 changes: 15 additions & 0 deletions tests/fixtures/nmdc_db/legacy_data_obj.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[
{
"id" : "nmdc:dobj-11-hnw52332",
"name" : "52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz",
"description" : "Metagenome Raw Reads for nmdc:omprc-11-cegmwy02",
"alternative_identifiers" : [

],
"file_size_bytes" : 31068664547,
"md5_checksum" : "12f380b91ff3364cd3d228505d3402b5",
"data_object_type" : "Metagenome Raw Reads",
"url" : "https://data.microbiomedata.org/data/nmdc:omprc-11-cegmwy02/nmdc:wfmsa-11-jc5cmf37.1/52832.3.464616.CTATCGCA-CTATCGCA.fastq.gz",
"type" : "nmdc:DataObject"
}
]
19 changes: 19 additions & 0 deletions tests/fixtures/nmdc_db/metagenome_sequencing.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[
{
"id" : "nmdc:wfmsa-11-jc5cmf37.1",
"name" : "Metagenome Sequencing Activity for nmdc:wfmsa-11-jc5cmf37.1",
"started_at_time" : "2023-09-13T19:57:49.595727+00:00",
"ended_at_time" : "2023-09-13T19:57:49.595743+00:00",
"was_informed_by" : "nmdc:omprc-11-cegmwy02",
"execution_resource" : "JGI",
"git_url" : "https://github.com/microbiomedata/RawSequencingData",
"has_input" : [
"nmdc:procsm-11-d8hkca85"
],
"has_output" : [
"nmdc:dobj-11-hnw52332"
],
"type" : "nmdc:MetagenomeSequencing",
"version" : "v1.0.0"
}
]
22 changes: 22 additions & 0 deletions tests/test_workflow_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,28 @@ def test_load_workflow_process_nodes(test_db, workflow_file, workflows_config_di
assert data_gen.children[0].type == "nmdc:ReadQcAnalysis"


def test_load_workflow_process_nodes_does_not_load_metagenome_sequencing(test_db, workflows_config_dir):
    """
    Legacy nmdc:MetagenomeSequencing records must be left out when the
    current workflow process nodes are loaded.
    """
    expected_id = "nmdc:omprc-11-cegmwy02"
    # (fixture file, target collection) pairs, loaded in this order
    fixtures = (
        ("legacy_data_obj.json", "data_object_set"),
        ("legacy_data_generation.json", "data_generation_set"),
        ("metagenome_sequencing.json", "workflow_execution_set"),
    )

    reset_db(test_db)
    for fixture_file, collection_name in fixtures:
        load_fixture(test_db, fixture_file, collection_name)

    workflow_configs = load_workflow_configs(workflows_config_dir / "workflows.yaml")
    required_data_objs = get_required_data_objects_map(test_db, workflow_configs)
    nodes = get_current_workflow_process_nodes(
        test_db, workflow_configs, required_data_objs, allowlist=[expected_id]
    )

    # The data generation record is the only node that should come back
    assert nodes
    assert len(nodes) == 1
    node = nodes[0]
    assert node.type == "nmdc:NucleotideSequencing"
    assert node.id == expected_id
    assert node.was_informed_by == expected_id


@mark.parametrize(
"workflow_file", ["workflows.yaml", "workflows-mt.yaml"]
)
Expand Down
Loading