From 782eb716f29c3f300569550ac641fbe1c4b9fba0 Mon Sep 17 00:00:00 2001 From: Shane Canon Date: Tue, 18 Jul 2023 12:31:47 -0700 Subject: [PATCH 1/4] Updates for scheduler - Adds dryrun - Adds skip and allow list through SKIPLISTFILE and ALLOWLISTFILE - Adds a force to require the full semantic version to match --- configs/workflows2.yaml | 6 +-- .../workflow_automation/activities.py | 29 ++++++----- nmdc_automation/workflow_automation/sched.py | 48 +++++++++++++++---- test_data/omics_processing_set.json | 31 ++++++++++++ test_data/read_QC_analysis_activity_set.json | 2 +- test_data/read_qc_analysis_activity_set2.json | 2 +- 6 files changed, 94 insertions(+), 24 deletions(-) create mode 100644 test_data/omics_processing_set.json diff --git a/configs/workflows2.yaml b/configs/workflows2.yaml index 3b733793..1003b52b 100644 --- a/configs/workflows2.yaml +++ b/configs/workflows2.yaml @@ -21,7 +21,7 @@ Workflows: Type: nmdc:ReadQcAnalysisActivity Enabled: True Git_repo: https://github.com/microbiomedata/ReadsQC - Version: b1.0.7 + Version: v1.0.7 WDL: rqcfilter.wdl Collection: read_qc_analysis_activity_set Filter Input Objects: @@ -56,7 +56,7 @@ Workflows: Type: nmdc:ReadQcAnalysisActivity Enabled: True Git_repo: https://github.com/microbiomedata/ReadsQC - Version: b1.1.8 + Version: v1.1.8 WDL: rqcfilter.wdl Collection: read_qc_analysis_activity_set Filter Input Objects: @@ -91,7 +91,7 @@ Workflows: Type: nmdc:ReadQcAnalysisActivity Enabled: True Git_repo: https://github.com/microbiomedata/ReadsQC - Version: b1.0.7 + Version: v1.0.7 Collection: read_qc_analysis_activity_set WDL: make_interleave_reads.wdl Input_prefix: make_interleaved_reads diff --git a/nmdc_automation/workflow_automation/activities.py b/nmdc_automation/workflow_automation/activities.py index 27560b64..0e6ab546 100644 --- a/nmdc_automation/workflow_automation/activities.py +++ b/nmdc_automation/workflow_automation/activities.py @@ -24,24 +24,28 @@ def _load_data_objects(db, workflows: List[Workflow]): def _check(match_types, data_object_ids, data_objs): + if not data_object_ids: + return False if not match_types or len(match_types) == 0: return True match_set = set(match_types) do_types = set() for doid in data_object_ids: - do_types.add(data_objs[doid].data_object_type) + if doid in data_objs: + do_types.add(data_objs[doid].data_object_type) return match_set.issubset(do_types) def _filter_skip(wf, rec, data_objs): match_in = _check(wf.filter_input_objects, - rec["has_input"], - data_objs) - match_out = _check(wf.filter_output_objects, - rec["has_output"], - data_objs) + rec.get("has_input"), + data_objs) + match_out = _check(wf.filter_output_objects, + rec.get("has_output"), + data_objs) return not (match_in and match_out) + def _read_acitivites(db, workflows: List[Workflow], data_objects: dict, filter: dict): """ @@ -53,6 +57,9 @@ def _read_acitivites(db, workflows: List[Workflow], q['git_url'] = wf.git_repo q['version'] = wf.version for rec in db[wf.collection].find(q): + if wf.collection == "omics_processing_set" and \ + rec["id"].startswith("gold"): + continue if _filter_skip(wf, rec, data_objects): continue act = Activity(rec, wf) @@ -90,8 +97,8 @@ def _resolve_relationships(activities, data_obj_act): # Let's make sure these came from the same source # This is just a safeguard if act.was_informed_by != parent_act.was_informed_by: - logging.warning("Mismatched informed by found for" - f"{do_id} in {act.id} ({act.name})") + logging.warning("Mismatched informed by for " + f"{do_id} in {act.id}") continue # We only want to use it as a parent if it is the right # parent workflow. Some inputs may come from ancestors @@ -100,11 +107,11 @@ def _resolve_relationships(activities, data_obj_act): # This is the one act.parent = parent_act parent_act.children.append(act) - logging.debug(f"Found parent: {parent_act.id} {parent_act.name}") + logging.debug(f"Found parent: {parent_act.id}" + f" {parent_act.name}") break if len(act.workflow.parents) > 0 and not act.parent: - logging.warning("Didn't find a parent for " - f"{act.id} ({act.name}) - {act.workflow.name}") + logging.warning(f"Didn't find a parent for {act.id}") # Now all the activities have their parent return activities diff --git a/nmdc_automation/workflow_automation/sched.py b/nmdc_automation/workflow_automation/sched.py index 683dfea6..01e6cb31 100644 --- a/nmdc_automation/workflow_automation/sched.py +++ b/nmdc_automation/workflow_automation/sched.py @@ -24,13 +24,15 @@ def get_mongo_db() -> MongoDatabase: raise KeyError(f"Missing MONGO_{k}") _client = MongoClient( host=os.getenv("MONGO_HOST"), + port=int(os.getenv("MONGO_PORT", "27017")), username=os.getenv("MONGO_USERNAME"), password=os.getenv("MONGO_PASSWORD"), + directConnection=True, ) return _client[os.getenv("MONGO_DBNAME")] -def within_range(wf1: Workflow, wf2: Workflow) -> bool: +def within_range(wf1: Workflow, wf2: Workflow, force=False) -> bool: """ Determine if two workflows are within a major and minor version of each other. @@ -44,6 +46,8 @@ def get_version(wf): return False v1 = get_version(wf1) v2 = get_version(wf2) + if force: + return v1==v2 if v1.major == v2.major and v1.minor == v2.minor: return True return False @@ -83,6 +87,10 @@ def __init__(self, db, wfn="workflows.yaml"): self.workflows = load_workflows(wf_file) self.db = db self.api = nmdcapi() + self.force = False + if os.environ.get("FORCE") == "1": + logging.info("Setting force on") + self.force = True async def run(self): logging.info("Starting Scheduler") @@ -160,7 +168,7 @@ def add_job_rec(self, job: Job): "config": job_config, "claims": [] } - self.db.jobs.insert_one(jr, bypass_document_validation=True) + self.db.jobs.insert_one(jr) logging.info(f'JOB RECORD: {jr["id"]}') # This would make the job record # print(json.dumps(ji, indent=2)) @@ -243,18 +251,18 @@ def find_new_jobs(self, act: Activity) -> list[Job]: # Look at previously generated derived # activities to see if this is already done. for child_act in act.children: - if within_range(child_act.workflow, wf): + if within_range(child_act.workflow, wf, force=self.force): break else: # These means no existing activities were # found that matched this workflow, so we # add a job - logging.debug(f"Creating a job {wf.name} for {act.id}") + logging.debug(f"Creating a job {wf.name}:{wf.version} for {act.id}") new_jobs.append(Job(wf, act)) return new_jobs - def cycle(self) -> list: + def cycle(self, dryrun: bool = False, skiplist: set = set(), allowlist = None) -> list: """ This function does a single cycle of looking for new jobs """ @@ -262,8 +270,18 @@ def cycle(self) -> list: self.get_existing_jobs.cache_clear() job_recs = [] for act in acts: + if act.was_informed_by in skiplist: + logging.debug(f"Skipping: {act.was_informed_by}") + continue + if allowlist and act.was_informed_by not in allowlist: + continue jobs = self.find_new_jobs(act) for job in jobs: + if dryrun: + msg = f"new job: informed_by: {job.informed_by} trigger: {job.trigger_id} " + msg += f"wf: {job.workflow.name}" + logging.info(msg) + continue try: jr = self.add_job_rec(job) if jr: @@ -274,16 +292,30 @@ def cycle(self) -> list: return job_recs -def main(): +def main(): # pragma: no cover """ Main function """ sched = Scheduler(get_mongo_db()) + dryrun = False + if os.environ.get("DRYRUN") == "1": + dryrun = True + skiplist = set() + allowlist = None + if os.environ.get("SKIPLISTFILE"): + with open(os.environ.get("SKIPLISTFILE")) as f: + for line in f: + skiplist.add(line.rstrip()) + if os.environ.get("ALLOWLISTFILE"): + allowlist = set() + with open(os.environ.get("ALLOWLISTFILE")) as f: + for line in f: + allowlist.add(line.rstrip()) while True: - sched.cycle() + sched.cycle(dryrun=dryrun, skiplist=skiplist, allowlist=allowlist) _sleep(_POLL_INTERVAL) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover logging.basicConfig(level=logging.INFO) main() diff --git a/test_data/omics_processing_set.json b/test_data/omics_processing_set.json new file mode 100644 index 00000000..9e0943a8 --- /dev/null +++ b/test_data/omics_processing_set.json @@ -0,0 +1,31 @@ +[ + { + "id": "nmdc:omprc-11-nhy4pz43", + "name": "Core terrestrial soil microbial communities from Talladega National Forest, Ozarks Complex, AL, USA - TALL_002-O-10-34-20140708-GEN-DNA1", + "has_input": [ + "nmdc:bsm-11-7qhhd037" + ], + "has_output": [ + "nmdc:22afa3d49b73eaec2e9787a6b88fbdc3" + ], + "add_date": "2020-01-27T00:00:00", + "mod_date": "2020-01-27T00:00:00", + "instrument_name": "Illumina HiSeq", + "ncbi_project_name": "Core terrestrial soil microbial communities from Talladega National Forest, Ozarks Complex, AL, USA - TALL_002-O-10-34-20140708-GEN-DNA1", + "omics_type": { + "has_raw_value": "Metagenome" + }, + "part_of": [ + "nmdc:sty-11-34xj1150" + ], + "principal_investigator": { + "has_raw_value": "Lee Stanish", + "email": "lstanish@gmail.com", + "name": "Lee Stanish" + }, + "type": "nmdc:OmicsProcessing", + "gold_sequencing_project_identifiers": [ + "GOLD:Gp0477109" + ] + } +] diff --git a/test_data/read_QC_analysis_activity_set.json b/test_data/read_QC_analysis_activity_set.json index e37dcc6e..9d6740e8 100644 --- a/test_data/read_QC_analysis_activity_set.json +++ b/test_data/read_QC_analysis_activity_set.json @@ -7,7 +7,7 @@ "nmdc:mga0vx38" ], "git_url": "https://github.com/microbiomedata/ReadsQC", - "version": "b1.0.7", + "version": "v1.0.7", "has_output": [ "nmdc:f107af0a000ec0b90e157fc09473c337", "nmdc:71528f677698dd6657ea7ddcc3105184" diff --git a/test_data/read_qc_analysis_activity_set2.json b/test_data/read_qc_analysis_activity_set2.json index 076cd272..ce9c618d 100644 --- a/test_data/read_qc_analysis_activity_set2.json +++ b/test_data/read_qc_analysis_activity_set2.json @@ -4,7 +4,7 @@ "nmdc:22afa3d49b73eaec2e9787a6b88fbdc3" ], "git_url": "https://github.com/microbiomedata/ReadsQC", - "version": "b1.1.8", + "version": "v1.1.8", "has_output": [ "nmdc:f107af0a000ec0b90e157fc09473c337v2", "nmdc:71528f677698dd6657ea7ddcc3105184v2" From 773f176f7b540b628e9ff41469266d501bd16fd0 Mon Sep 17 00:00:00 2001 From: Shane Canon Date: Tue, 18 Jul 2023 14:30:47 -0700 Subject: [PATCH 2/4] Fix bad merge in workflows.yaml and fix Dockerfile --- Dockerfile | 11 +++++++--- configs/workflows.yaml | 47 ------------------------------------------ 2 files changed, 8 insertions(+), 50 deletions(-) diff --git a/Dockerfile b/Dockerfile index 390bd56d..9158da2b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,15 @@ FROM python:3.9 -ADD requirements.txt /tmp/requirements.txt +RUN \ + pip install poetry && \ + poetry config virtualenvs.create false -RUN pip install -r /tmp/requirements.txt +ADD pyproject.toml poetry.lock README.md /src/ +WORKDIR /src +RUN \ + poetry install --only=main --no-root +RUN pip install semver ADD . /src -WORKDIR /src diff --git a/configs/workflows.yaml b/configs/workflows.yaml index 027a033b..fa407a58 100644 --- a/configs/workflows.yaml +++ b/configs/workflows.yaml @@ -44,17 +44,14 @@ Workflows: Outputs: - output: filtered_final name: Reads QC result fastq (clean data) - suffix: "_filtered.fastq.gz" data_object_type: Filtered Sequencing Reads description: "Reads QC for {id}" - output: filtered_stats_final name: Reads QC summary statistics - suffix: "_filterStats.txt" data_object_type: QC Statistics description: "Reads QC summary for {id}" - output: rqc_info name: File containing read filtering information - suffix: "_readsQC.info" data_object_type: Read Filtering Info File description: "Read filtering info for {id}" @@ -86,17 +83,14 @@ Workflows: Outputs: - output: filtered_final name: Reads QC result fastq (clean data) - suffix: "_filtered.fastq.gz" data_object_type: Filtered Sequencing Reads description: "Reads QC for {id}" - output: filtered_stats_final name: Reads QC summary statistics - suffix: "_filterStats.txt" data_object_type: QC Statistics description: "Reads QC summary for {id}" - output: rqc_info name: File containing read filtering information - suffix: "_readsQC.info" data_object_type: Read Filtering Info File description: "Read filtering info for {id}" @@ -146,32 +140,26 @@ Workflows: Outputs: - output: contig name: Final assembly contigs fasta - suffix: "_contigs.fna" data_object_type: Assembly Contigs description: "Assembly contigs for {id}" - output: scaffold name: Final assembly scaffolds fasta - suffix: "_scaffolds.fna" data_object_type: Assembly Scaffolds description: "Assembly scaffolds for {id}" - output: covstats name: Assembled contigs coverage information - suffix: "_covstats.txt" data_object_type: Assembly Coverage Stats description: "Coverage Stats for {id}" - output: agp name: An AGP format file that describes the assembly - suffix: "_assembly.agp" data_object_type: Assembly AGP description: "AGP for {id}" - output: bam name: Sorted bam file of reads mapping back to the final assembly - suffix: "_pairedMapped_sorted.bam" data_object_type: Assembly Coverage BAM description: "Sorted Bam for {id}" - output: asminfo name: File containing assembly info - suffix: "_metaAsm.info" data_object_type: Assembly Info File description: "Assembly info for {id}" @@ -197,22 +185,18 @@ Workflows: data_object_type: Annotation Amino Acid FASTA description: FASTA Amino Acid File for {id} name: FASTA amino acid file for annotated proteins - suffix: _proteins.faa - output: structural_gff data_object_type: Structural Annotation GFF description: Structural Annotation for {id} name: GFF3 format file with structural annotations - suffix: _structural_annotation.gff - output: functional_gff data_object_type: Functional Annotation GFF description: Functional Annotation for {id} name: GFF3 format file with functional annotations - suffix: _functional_annotation.gff - output: ko_tsv data_object_type: Annotation KEGG Orthology description: KEGG Orthology for {id} name: Tab delimited file for KO annotation - suffix: _ko.tsv - output: ec_tsv data_object_type: Annotation Enzyme Commission description: EC Annotations for {id} @@ -227,92 +211,74 @@ Workflows: data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF description: COGs for {id} name: GFF3 format file with COGs - suffix: _cog.gff - output: pfam_gff data_object_type: Pfam Annotation GFF description: Pfam Annotation for {id} name: GFF3 format file with Pfam - suffix: _pfam.gff - output: tigrfam_gff data_object_type: TIGRFam Annotation GFF description: TIGRFam for {id} name: GFF3 format file with TIGRfam - suffix: _tigrfam.gff - output: smart_gff data_object_type: SMART Annotation GFF description: SMART Annotations for {id} name: GFF3 format file with SMART - suffix: _smart.gff - output: supfam_gff data_object_type: SUPERFam Annotation GFF description: SUPERFam Annotations for {id} name: GFF3 format file with SUPERFam - suffix: _supfam.gff - output: cath_funfam_gff data_object_type: CATH FunFams (Functional Families) Annotation GFF description: CATH FunFams for {id} name: GFF3 format file with CATH FunFams - suffix: _cath_funfam.gff - output: crt_gff data_object_type: CRT Annotation GFF description: CRT Annotations for {id} name: GFF3 format file with CRT - suffix: _crt.gff - output: genemark_gff data_object_type: Genmark Annotation GFF description: Genemark Annotations for {id} name: GFF3 format file with Genemark - suffix: _genemark.gff - output: prodigal_gff data_object_type: Prodigal Annotation GFF description: Prodigal Annotations {id} name: GFF3 format file with Prodigal - suffix: _prodigal.gff - output: trna_gff data_object_type: TRNA Annotation GFF description: TRNA Annotations {id} name: GFF3 format file with TRNA - suffix: _trna.gff - output: final_rfam_gff data_object_type: RFAM Annotation GFF description: RFAM Annotations for {id} name: GFF3 format file with RFAM - suffix: _rfam.gff - output: ko_ec_gff data_object_type: KO_EC Annotation GFF description: KO_EC Annotations for {id} name: GFF3 format file with KO_EC - suffix: _ko_ec.gff - output: product_names_tsv data_object_type: Product Names description: Product names for {id} name: Product names file - suffix: _product_names.tsv - output: gene_phylogeny_tsv data_object_type: Gene Phylogeny tsv description: Gene Phylogeny for {id} name: Gene Phylogeny file - suffix: _gene_phylogeny.tsv - output: crt_crisprs data_object_type: Crisprt Terms description: Crispr Terms for {id} name: Crispr Terms - suffix: _crt.crisprs - output: stats_tsv data_object_type: Annotation Statistics description: Annotation Stats for {id} name: Annotation statistics report - suffix: _stats.tsv - output: contig_mapping data_object_type: Contig Mapping File description: Conging mappings file for {id} name: Contig mappings between contigs and scaffolds - suffix: _contig_names_mapping.tsv - output: imgap_version data_object_type: Annotation Info File description: Annotation info for {id} name: File containing annotation info - suffix: _imgap.info - Name: MAGs Type: nmdc:MAGsAnalysisActivity @@ -350,17 +316,14 @@ Workflows: data_object_type: CheckM Statistics description: CheckM for {id} name: CheckM statistics report - suffix: _checkm_qa.out - output: final_hqmq_bins_zip data_object_type: Metagenome Bins description: Metagenome Bins for {id} name: Metagenome bin tarfiles archive - suffix: _hqmq_bin.zip - output: final_gtdbtk_bac_summary data_object_type: GTDBTK Bacterial Summary description: Bacterial Summary for {id} name: GTDBTK bacterial summary - suffix: _gtdbtk.bac122.summary.tsv - output: final_gtdbtk_ar_summary data_object_type: GTDBTK Archaeal Summary description: Archaeal Summary for {id} @@ -370,7 +333,6 @@ Workflows: data_object_tye: Metagenome Bins Info File description: Metagenome Bins Info File for {id} name: Metagenome Bins Info File - suffix: _bin.info - Name: Readbased Analysis Type: nmdc:ReadBasedTaxonomyAnalysisActivity @@ -393,47 +355,38 @@ Workflows: data_object_type: GOTTCHA2 Classification Report description: GOTTCHA2 Classification for {id} name: GOTTCHA2 classification report file - suffix: _gottcha2_report.tsv - output: final_gottcha2_full_tsv data_object_type: GOTTCHA2 Report Full description: GOTTCHA2 Full Report for {id} name: GOTTCHA2 report file - suffix: _gottcha2_full_tsv - output: final_gottcha2_krona_html data_object_type: GOTTCHA2 Krona Plot description: GOTTCHA2 Krona for {id} name: GOTTCHA2 krona plot HTML file - suffix: _gottcha2_krona.html - output: final_centrifuge_classification_tsv data_object_type: Centrifuge Taxonomic Classification description: Centrifuge Classification for {id} name: Centrifuge output read classification file - suffix: _centrifuge_classification.tsv - output: final_centrifuge_report_tsv data_object_type: Centrifuge output report file description: Centrifuge Report for {id} name: Centrifuge Classification Report - suffix: _centrifuge_report.tsv - output: final_centrifuge_krona_html data_object_type: Centrifuge Krona Plot description: Centrifuge Krona for {id} name: Centrifug krona plot HTML file - suffix: _centrifuge_krona.html - output: final_kraken2_classification_tsv data_object_type: Kraken2 Taxonomic Classification description: Kraken2 Classification for {id} name: Kraken2 output read classification file - suffix: _kraken2_classification.tsv - output: final_kraken2_report_tsv data_object_type: Kraken2 Classification Report description: Kraken2 Report for {id} name: Kraken2 output report file - suffix: _kraken2_report.tsv - output: final_kraken2_krona_html data_object_type: Kraken2 Krona Plot description: Kraken2 Krona for {id} name: Kraken2 Krona plot HTML file - suffix: _kraken2_krona.html - output: info_file data_object_type: Read Based Analysis Info File description: Read based analysis info for {id} From d6684f2a9f43742e9ba3a2a00ea22d950c7ab2fa Mon Sep 17 00:00:00 2001 From: Shane Canon Date: Thu, 26 Oct 2023 17:22:46 -0700 Subject: [PATCH 3/4] Sync up with other version --- configs/workflows2.yaml | 60 +++++------------------------------------ 1 file changed, 7 insertions(+), 53 deletions(-) diff --git a/configs/workflows2.yaml b/configs/workflows2.yaml index 1003b52b..eb1204d5 100644 --- a/configs/workflows2.yaml +++ b/configs/workflows2.yaml @@ -43,12 +43,10 @@ Workflows: Outputs: - output: filtered_final name: Reads QC result fastq (clean data) - suffix: "_filtered.fastq.gz" data_object_type: Filtered Sequencing Reads description: "Reads QC for {id}" - output: filtered_stats_final name: Reads QC summary statistics - suffix: "_filterStats.txt" data_object_type: QC Statistics description: "Reads QC summary for {id}" @@ -78,12 +76,10 @@ Workflows: Outputs: - output: filtered_final name: Reads QC result fastq (clean data) - suffix: "_filtered.fastq.gz" data_object_type: Filtered Sequencing Reads description: "Reads QC for {id}" - output: filtered_stats_final name: Reads QC summary statistics - suffix: "_filterStats.txt" data_object_type: QC Statistics description: "Reads QC summary for {id}" @@ -98,10 +94,10 @@ Workflows: Inputs: proj: "{activity_id}" input_file_1: do:Metagenome Raw Read 1 - input_file_1: do:Metagenome Raw Read 2 + input_file_2: do:Metagenome Raw Read 2 Filter Input Objects: - - Metagenome Raw Reads 1 - - Metagenome Raw Reads 2 + - Metagenome Raw Read 1 + - Metagenome Raw Read 2 Predecessors: - Sequencing Noninterleaved Input_prefix: nmdc_rqcfilter @@ -115,17 +111,14 @@ Workflows: Outputs: - output: filtered_final name: Reads QC result fastq (clean data) - suffix: "_filtered.fastq.gz" data_object_type: Filtered Sequencing Reads description: "Reads QC for {id}" - output: filtered_stats_final name: Reads QC summary statistics - suffix: "_filterStats.txt" data_object_type: QC Statistics description: "Reads QC summary for {id}" - output: rqc_info name: File containing read filtering information - suffix: "_readsQC.info" data_object_type: Read Filtering Info File description: "Read filtering info for {id}" @@ -133,7 +126,7 @@ Workflows: Type: nmdc:MetagenomeAssembly Enabled: True Git_repo: https://github.com/microbiomedata/metaAssembly - Version: v1.0.4-beta + Version: v1.0.3 WDL: jgi_assembly.wdl Collection: metagenome_assembly_set Predecessors: @@ -175,27 +168,22 @@ Workflows: Outputs: - output: contig name: Final assembly contigs fasta - suffix: "_contigs.fna" data_object_type: Assembly Contigs description: "Assembly contigs for {id}" - output: scaffold name: Final assembly scaffolds fasta - suffix: "_scaffolds.fna" data_object_type: Assembly Scaffolds description: "Assembly scaffolds for {id}" - output: covstats name: Assembled contigs coverage information - suffix: "_covstats.txt" data_object_type: Assembly Coverage Stats description: "Coverage Stats for {id}" - output: agp name: An AGP format file that describes the assembly - suffix: "_assembly.agp" data_object_type: Assembly AGP description: "AGP for {id}" - output: bam name: Sorted bam file of reads mapping back to the final assembly - suffix: "_pairedMapped_sorted.bam" data_object_type: Assembly Coverage BAM description: "Sorted Bam for {id}" @@ -203,7 +191,7 @@ Workflows: Type: nmdc:MetagenomeAnnotationActivity Enabled: True Git_repo: https://github.com/microbiomedata/mg_annotation - Version: v1.0.2-beta + Version: v1.0.4 WDL: annotation_full.wdl Collection: metagenome_annotation_activity_set Predecessors: @@ -221,114 +209,93 @@ Workflows: data_object_type: Annotation Amino Acid FASTA description: FASTA Amino Acid File for {id} name: FASTA amino acid file for annotated proteins - suffix: _proteins.faa - output: structural_gff data_object_type: Structural Annotation GFF description: Structural Annotation for {id} name: GFF3 format file with structural annotations - suffix: _structural_annotation.gff - output: functional_gff data_object_type: Functional Annotation GFF description: Functional Annotation for {id} name: GFF3 format file with functional annotations - suffix: _functional_annotation.gff - output: ko_tsv data_object_type: Annotation KEGG Orthology description: KEGG Orthology for {id} name: Tab delimited file for KO annotation - suffix: _ko.tsv - output: ec_tsv data_object_type: Annotation Enzyme Commission description: EC Annotations for {id} name: Tab delimited file for EC annotation - suffix: _ec.tsv - output: cog_gff data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF description: COGs for {id} name: GFF3 format file with COGs - suffix: _cog.gff - output: pfam_gff data_object_type: Pfam Annotation GFF description: Pfam Annotation for {id} name: GFF3 format file with Pfam - suffix: _pfam.gff - output: tigrfam_gff data_object_type: TIGRFam Annotation GFF description: TIGRFam for {id} name: GFF3 format file with TIGRfam - suffix: _tigrfam.gff - output: smart_gff data_object_type: SMART Annotation GFF description: SMART Annotations for {id} name: GFF3 format file with SMART - suffix: _smart.gff - output: supfam_gff data_object_type: SUPERFam Annotation GFF description: SUPERFam Annotations for {id} name: GFF3 format file with SUPERFam - suffix: _supfam.gff - output: cath_funfam_gff data_object_type: CATH FunFams (Functional Families) Annotation GFF description: CATH FunFams for {id} name: GFF3 format file with CATH FunFams - suffix: _cath_funfam.gff - output: crt_gff data_object_type: CRT Annotation GFF description: CRT Annotations for {id} name: GFF3 format file with CRT - suffix: _crt.gff - output: genemark_gff data_object_type: Genmark Annotation GFF description: Genemark Annotations for {id} name: GFF3 format file with Genemark - suffix: _genemark.gff - output: prodigal_gff data_object_type: Prodigal Annotation GFF description: Prodigal Annotations {id} name: GFF3 format file with Prodigal - suffix: _prodigal.gff - output: trna_gff data_object_type: TRNA Annotation GFF description: TRNA Annotations {id} name: GFF3 format file with TRNA - suffix: _trna.gff - output: final_rfam_gff data_object_type: RFAM Annotation GFF description: RFAM Annotations for {id} name: GFF3 format file with RFAM - suffix: _rfam.gff - output: ko_ec_gff data_object_type: KO_EC Annotation GFF description: KO_EC Annotations for {id} name: GFF3 format file with KO_EC - suffix: _ko_ec.gff - output: product_names_tsv data_object_type: Product Names description: Product names for {id} name: Product names file - suffix: _product_names.tsv - output: gene_phylogeny_tsv data_object_type: Gene Phylogeny tsv description: Gene Phylogeny for {id} name: Gene Phylogeny file - suffix: _gene_phylogeny.tsv - output: crt_crisprs data_object_type: Crisprt Terms description: Crispr Terms for {id} name: Crispr Terms - suffix: _crt.crisprs - output: stats_tsv data_object_type: Annotation Statistics description: Annotation Stats for {id} name: Annotation statistics report - suffix: _stats.tsv - Name: MAGs Type: nmdc:MAGsAnalysisActivity Enabled: True Git_repo: https://github.com/microbiomedata/mg_annotation Git_repo: https://github.com/microbiomedata/metaMAGs - Version: v1.0.5-beta + Version: v1.0.6 WDL: mbin_nmdc.wdl Collection: mags_activity_set Predecessors: @@ -359,28 +326,24 @@ Workflows: data_object_type: CheckM Statistics description: CheckM for {id} name: CheckM statistics report - suffix: _checkm_qa.out - output: final_hqmq_bins_zip data_object_type: Metagenome Bins description: Metagenome Bins for {id} name: Metagenome bin tarfiles archive - suffix: _hqmq_bin.zip - output: final_gtdbtk_bac_summary data_object_type: GTDBTK Bacterial Summary description: Bacterial Summary for {id} name: GTDBTK bacterial summary - suffix: _gtdbtk.bac122.summary.tsv - output: final_gtdbtk_ar_summary data_object_type: GTDBTK Archaeal Summary description: Archaeal Summary for {id} name: GTDBTK archaeal summary - suffix: _gtdbtk.ar122.summary.tsv - Name: Readbased Analysis Type: nmdc:ReadBasedTaxonomyAnalysisActivity Enabled: True Git_repo: https://github.com/microbiomedata/ReadbasedAnalysis - Version: v1.0.5-beta + Version: v1.0.5 WDL: ReadbasedAnalysis.wdl Collection: read_based_taxonomy_analysis_activity_set Predecessors: @@ -397,45 +360,36 @@ Workflows: data_object_type: GOTTCHA2 Classification Report description: GOTTCHA2 Classification for {id} name: GOTTCHA2 classification report file - suffix: _gottcha2_report.tsv - output: final_gottcha2_full_tsv data_object_type: GOTTCHA2 Report Full description: GOTTCHA2 Full Report for {id} name: GOTTCHA2 report file - suffix: _gottcha2_full_tsv - output: final_gottcha2_krona_html data_object_type: GOTTCHA2 Krona Plot description: GOTTCHA2 Krona for {id} name: GOTTCHA2 krona plot HTML file - suffix: _gottcha2_krona.html - output: final_centrifuge_classification_tsv data_object_type: Centrifuge Taxonomic Classification description: Centrifuge Classification for {id} name: Centrifuge output read classification file - suffix: _centrifuge_classification.tsv - output: final_centrifuge_report_tsv data_object_type: Centrifuge output report file description: Centrifuge Report for {id} name: Centrifuge Classification Report - suffix: _centrifuge_report.tsv - output: final_centrifuge_krona_html data_object_type: Centrifuge Krona Plot description: Centrifuge Krona for {id} name: Centrifug krona plot HTML file - suffix: _centrifuge_krona.html - output: final_kraken2_classification_tsv data_object_type: Kraken2 Taxonomic Classification description: Kraken2 Classification for {id} name: Kraken2 output read classification file - suffix: _kraken2_classification.tsv - output: final_kraken2_report_tsv data_object_type: Kraken2 Classification Report description: Kraken2 Report for {id} name: Kraken2 output report file - suffix: _kraken2_report.tsv - output: final_kraken2_krona_html data_object_type: Kraken2 Krona Plot description: Kraken2 Krona for {id} name: Kraken2 Krona plot HTML file - suffix: _kraken2_krona.html From a736b084c41b5b1c0d364ef9a24e73ff9d69c286 Mon Sep 17 00:00:00 2001 From: Shane Canon Date: Thu, 26 Oct 2023 17:23:14 -0700 Subject: [PATCH 4/4] Adding logging --- nmdc_automation/workflow_automation/activities.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nmdc_automation/workflow_automation/activities.py b/nmdc_automation/workflow_automation/activities.py index 0e6ab546..f1580f46 100644 --- a/nmdc_automation/workflow_automation/activities.py +++ b/nmdc_automation/workflow_automation/activities.py @@ -53,6 +53,7 @@ def _read_acitivites(db, workflows: List[Workflow], """ activities = [] for wf in workflows: + logging.debug(f"Checking {wf.name}:{wf.version}") q = filter q['git_url'] = wf.git_repo q['version'] = wf.version