From 782eb716f29c3f300569550ac641fbe1c4b9fba0 Mon Sep 17 00:00:00 2001
From: Shane Canon <scanon@lbl.gov>
Date: Tue, 18 Jul 2023 12:31:47 -0700
Subject: [PATCH 1/4] Updates for scheduler

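- Adds a dry-run mode (DRYRUN=1) that logs new jobs instead of creating them
- Adds skip and allow lists, read from the files named by SKIPLISTFILE
  and ALLOWLISTFILE
- Adds a force option (FORCE=1) that requires the full semantic version
  to match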
---
 configs/workflows2.yaml                       |  6 +--
 .../workflow_automation/activities.py         | 29 ++++++-----
 nmdc_automation/workflow_automation/sched.py  | 48 +++++++++++++++----
 test_data/omics_processing_set.json           | 31 ++++++++++++
 test_data/read_QC_analysis_activity_set.json  |  2 +-
 test_data/read_qc_analysis_activity_set2.json |  2 +-
 6 files changed, 94 insertions(+), 24 deletions(-)
 create mode 100644 test_data/omics_processing_set.json

diff --git a/configs/workflows2.yaml b/configs/workflows2.yaml
index 3b733793..1003b52b 100644
--- a/configs/workflows2.yaml
+++ b/configs/workflows2.yaml
@@ -21,7 +21,7 @@ Workflows:
     Type: nmdc:ReadQcAnalysisActivity
     Enabled: True
     Git_repo: https://github.com/microbiomedata/ReadsQC
-    Version: b1.0.7
+    Version: v1.0.7
     WDL: rqcfilter.wdl
     Collection: read_qc_analysis_activity_set
     Filter Input Objects:
@@ -56,7 +56,7 @@ Workflows:
     Type: nmdc:ReadQcAnalysisActivity
     Enabled: True
     Git_repo: https://github.com/microbiomedata/ReadsQC
-    Version: b1.1.8
+    Version: v1.1.8
     WDL: rqcfilter.wdl
     Collection: read_qc_analysis_activity_set
     Filter Input Objects:
@@ -91,7 +91,7 @@ Workflows:
     Type: nmdc:ReadQcAnalysisActivity
     Enabled: True
     Git_repo: https://github.com/microbiomedata/ReadsQC
-    Version: b1.0.7
+    Version: v1.0.7
     Collection: read_qc_analysis_activity_set
     WDL: make_interleave_reads.wdl
     Input_prefix: make_interleaved_reads
diff --git a/nmdc_automation/workflow_automation/activities.py b/nmdc_automation/workflow_automation/activities.py
index 27560b64..0e6ab546 100644
--- a/nmdc_automation/workflow_automation/activities.py
+++ b/nmdc_automation/workflow_automation/activities.py
@@ -24,24 +24,28 @@ def _load_data_objects(db, workflows: List[Workflow]):
 
 
 def _check(match_types, data_object_ids, data_objs):
+    if not data_object_ids:
+        return False
     if not match_types or len(match_types) == 0:
         return True
     match_set = set(match_types)
     do_types = set()
     for doid in data_object_ids:
-        do_types.add(data_objs[doid].data_object_type)
+        if doid in data_objs:
+            do_types.add(data_objs[doid].data_object_type)
     return match_set.issubset(do_types)
 
 
 def _filter_skip(wf, rec, data_objs):
     match_in = _check(wf.filter_input_objects,
-                   rec["has_input"],
-                   data_objs)
-    match_out =  _check(wf.filter_output_objects,
-                   rec["has_output"],
-                   data_objs)
+                      rec.get("has_input"),
+                      data_objs)
+    match_out = _check(wf.filter_output_objects,
+                       rec.get("has_output"),
+                       data_objs)
     return not (match_in and match_out)
 
+
 def _read_acitivites(db, workflows: List[Workflow],
                      data_objects: dict, filter: dict):
     """
@@ -53,6 +57,9 @@ def _read_acitivites(db, workflows: List[Workflow],
         q['git_url'] = wf.git_repo
         q['version'] = wf.version
         for rec in db[wf.collection].find(q):
+            if wf.collection == "omics_processing_set" and \
+               rec["id"].startswith("gold"):
+                continue
             if _filter_skip(wf, rec, data_objects):
                 continue
             act = Activity(rec, wf)
@@ -90,8 +97,8 @@ def _resolve_relationships(activities, data_obj_act):
             # Let's make sure these came from the same source
             # This is just a safeguard
             if act.was_informed_by != parent_act.was_informed_by:
-                logging.warning("Mismatched informed by found for"
-                                f"{do_id} in {act.id} ({act.name})")
+                logging.warning("Mismatched informed by for "
+                                f"{do_id} in {act.id}")
                 continue
             # We only want to use it as a parent if it is the right
             # parent workflow. Some inputs may come from ancestors
@@ -100,11 +107,11 @@ def _resolve_relationships(activities, data_obj_act):
                 # This is the one
                 act.parent = parent_act
                 parent_act.children.append(act)
-                logging.debug(f"Found parent: {parent_act.id} {parent_act.name}")
+                logging.debug(f"Found parent: {parent_act.id}"
+                              f" {parent_act.name}")
                 break
         if len(act.workflow.parents) > 0 and not act.parent:
-            logging.warning("Didn't find a parent for "
-                            f"{act.id} ({act.name}) - {act.workflow.name}")
+            logging.warning(f"Didn't find a parent for {act.id}")
     # Now all the activities have their parent
     return activities
 
diff --git a/nmdc_automation/workflow_automation/sched.py b/nmdc_automation/workflow_automation/sched.py
index 683dfea6..01e6cb31 100644
--- a/nmdc_automation/workflow_automation/sched.py
+++ b/nmdc_automation/workflow_automation/sched.py
@@ -24,13 +24,15 @@ def get_mongo_db() -> MongoDatabase:
             raise KeyError(f"Missing MONGO_{k}")
     _client = MongoClient(
         host=os.getenv("MONGO_HOST"),
+        port=int(os.getenv("MONGO_PORT", "27017")),
         username=os.getenv("MONGO_USERNAME"),
         password=os.getenv("MONGO_PASSWORD"),
+        directConnection=True,
     )
     return _client[os.getenv("MONGO_DBNAME")]
 
 
-def within_range(wf1: Workflow, wf2: Workflow) -> bool:
+def within_range(wf1: Workflow, wf2: Workflow, force=False) -> bool:
     """
     Determine if two workflows are within a major and minor
     version of each other.
@@ -44,6 +46,8 @@ def get_version(wf):
         return False
     v1 = get_version(wf1)
     v2 = get_version(wf2)
+    if force:
+        return v1==v2
     if v1.major == v2.major and v1.minor == v2.minor:
         return True
     return False
@@ -83,6 +87,10 @@ def __init__(self, db, wfn="workflows.yaml"):
         self.workflows = load_workflows(wf_file)
         self.db = db
         self.api = nmdcapi()
+        self.force = False
+        if os.environ.get("FORCE") == "1":
+            logging.info("Setting force on")
+            self.force = True
 
     async def run(self):
         logging.info("Starting Scheduler")
@@ -160,7 +168,7 @@ def add_job_rec(self, job: Job):
             "config": job_config,
             "claims": []
         }
-        self.db.jobs.insert_one(jr, bypass_document_validation=True)
+        self.db.jobs.insert_one(jr)
         logging.info(f'JOB RECORD: {jr["id"]}')
         # This would make the job record
         # print(json.dumps(ji, indent=2))
@@ -243,18 +251,18 @@ def find_new_jobs(self, act: Activity) -> list[Job]:
             # Look at previously generated derived
             # activities to see if this is already done.
             for child_act in act.children:
-                if within_range(child_act.workflow, wf):
+                if within_range(child_act.workflow, wf, force=self.force):
                     break
             else:
                 # These means no existing activities were
                 # found that matched this workflow, so we
                 # add a job
-                logging.debug(f"Creating a job {wf.name} for {act.id}")
+                logging.debug(f"Creating a job {wf.name}:{wf.version} for {act.id}")
                 new_jobs.append(Job(wf, act))
 
         return new_jobs
 
-    def cycle(self) -> list:
+    def cycle(self, dryrun: bool = False, skiplist: set = set(), allowlist = None) -> list:
         """
         This function does a single cycle of looking for new jobs
         """
@@ -262,8 +270,18 @@ def cycle(self) -> list:
         self.get_existing_jobs.cache_clear()
         job_recs = []
         for act in acts:
+            if act.was_informed_by in skiplist:
+                logging.debug(f"Skipping: {act.was_informed_by}")
+                continue
+            if allowlist and act.was_informed_by not in allowlist:
+                continue
             jobs = self.find_new_jobs(act)
             for job in jobs:
+                if dryrun:
+                    msg = f"new job: informed_by: {job.informed_by} trigger: {job.trigger_id} "
+                    msg += f"wf: {job.workflow.name}"
+                    logging.info(msg)
+                    continue
                 try:
                     jr = self.add_job_rec(job)
                     if jr:
@@ -274,16 +292,30 @@ def cycle(self) -> list:
         return job_recs
 
 
-def main():
+def main():  # pragma: no cover
     """
     Main function
     """
     sched = Scheduler(get_mongo_db())
+    dryrun = False
+    if os.environ.get("DRYRUN") == "1":
+        dryrun = True
+    skiplist = set()
+    allowlist = None
+    if os.environ.get("SKIPLISTFILE"):
+        with open(os.environ.get("SKIPLISTFILE")) as f:
+            for line in f:
+                skiplist.add(line.rstrip())
+    if os.environ.get("ALLOWLISTFILE"):
+        allowlist = set()
+        with open(os.environ.get("ALLOWLISTFILE")) as f:
+            for line in f:
+                allowlist.add(line.rstrip())
     while True:
-        sched.cycle()
+        sched.cycle(dryrun=dryrun, skiplist=skiplist, allowlist=allowlist)
         _sleep(_POLL_INTERVAL)
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     logging.basicConfig(level=logging.INFO)
     main()
diff --git a/test_data/omics_processing_set.json b/test_data/omics_processing_set.json
new file mode 100644
index 00000000..9e0943a8
--- /dev/null
+++ b/test_data/omics_processing_set.json
@@ -0,0 +1,31 @@
+[
+  {
+    "id": "nmdc:omprc-11-nhy4pz43",
+    "name": "Core terrestrial soil microbial communities from Talladega National Forest, Ozarks Complex, AL, USA - TALL_002-O-10-34-20140708-GEN-DNA1",
+    "has_input": [
+      "nmdc:bsm-11-7qhhd037"
+    ],
+    "has_output": [
+      "nmdc:22afa3d49b73eaec2e9787a6b88fbdc3"
+    ],
+    "add_date": "2020-01-27T00:00:00",
+    "mod_date": "2020-01-27T00:00:00",
+    "instrument_name": "Illumina HiSeq",
+    "ncbi_project_name": "Core terrestrial soil microbial communities from Talladega National Forest, Ozarks Complex, AL, USA - TALL_002-O-10-34-20140708-GEN-DNA1",
+    "omics_type": {
+      "has_raw_value": "Metagenome"
+    },
+    "part_of": [
+      "nmdc:sty-11-34xj1150"
+    ],
+    "principal_investigator": {
+      "has_raw_value": "Lee Stanish",
+      "email": "lstanish@gmail.com",
+      "name": "Lee Stanish"
+    },
+    "type": "nmdc:OmicsProcessing",
+    "gold_sequencing_project_identifiers": [
+      "GOLD:Gp0477109"
+    ]
+  }
+]
diff --git a/test_data/read_QC_analysis_activity_set.json b/test_data/read_QC_analysis_activity_set.json
index e37dcc6e..9d6740e8 100644
--- a/test_data/read_QC_analysis_activity_set.json
+++ b/test_data/read_QC_analysis_activity_set.json
@@ -7,7 +7,7 @@
       "nmdc:mga0vx38"
     ],
     "git_url": "https://github.com/microbiomedata/ReadsQC",
-    "version": "b1.0.7",
+    "version": "v1.0.7",
     "has_output": [
       "nmdc:f107af0a000ec0b90e157fc09473c337",
       "nmdc:71528f677698dd6657ea7ddcc3105184"
diff --git a/test_data/read_qc_analysis_activity_set2.json b/test_data/read_qc_analysis_activity_set2.json
index 076cd272..ce9c618d 100644
--- a/test_data/read_qc_analysis_activity_set2.json
+++ b/test_data/read_qc_analysis_activity_set2.json
@@ -4,7 +4,7 @@
       "nmdc:22afa3d49b73eaec2e9787a6b88fbdc3"
     ],
     "git_url": "https://github.com/microbiomedata/ReadsQC",
-    "version": "b1.1.8",
+    "version": "v1.1.8",
     "has_output": [
       "nmdc:f107af0a000ec0b90e157fc09473c337v2",
       "nmdc:71528f677698dd6657ea7ddcc3105184v2"

From 773f176f7b540b628e9ff41469266d501bd16fd0 Mon Sep 17 00:00:00 2001
From: Shane Canon <scanon@lbl.gov>
Date: Tue, 18 Jul 2023 14:30:47 -0700
Subject: [PATCH 2/4] Fix bad merge in workflows.yaml and fix Dockerfile

---
 Dockerfile             | 11 +++++++---
 configs/workflows.yaml | 47 ------------------------------------------
 2 files changed, 8 insertions(+), 50 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 390bd56d..9158da2b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,15 @@
 FROM python:3.9
 
-ADD requirements.txt /tmp/requirements.txt
+RUN \
+    pip install poetry && \
+    poetry config virtualenvs.create false
 
-RUN pip install -r /tmp/requirements.txt
+ADD pyproject.toml poetry.lock README.md /src/
+WORKDIR /src
+RUN \
+    poetry install --only=main --no-root
 
+RUN pip install semver
 
 ADD . /src
 
-WORKDIR /src
diff --git a/configs/workflows.yaml b/configs/workflows.yaml
index 027a033b..fa407a58 100644
--- a/configs/workflows.yaml
+++ b/configs/workflows.yaml
@@ -44,17 +44,14 @@ Workflows:
     Outputs:
       - output: filtered_final
         name: Reads QC result fastq (clean data)
-        suffix: "_filtered.fastq.gz"
         data_object_type: Filtered Sequencing Reads
         description: "Reads QC for {id}"
       - output: filtered_stats_final
         name: Reads QC summary statistics
-        suffix: "_filterStats.txt"
         data_object_type: QC Statistics
         description: "Reads QC summary for {id}"
       - output: rqc_info
         name: File containing read filtering information
-        suffix: "_readsQC.info"
         data_object_type: Read Filtering Info File
         description: "Read filtering info for {id}"
 
@@ -86,17 +83,14 @@ Workflows:
     Outputs:
       - output: filtered_final
         name: Reads QC result fastq (clean data)
-        suffix: "_filtered.fastq.gz"
         data_object_type: Filtered Sequencing Reads
         description: "Reads QC for {id}"
       - output: filtered_stats_final
         name: Reads QC summary statistics
-        suffix: "_filterStats.txt"
         data_object_type: QC Statistics
         description: "Reads QC summary for {id}"
       - output: rqc_info
         name: File containing read filtering information
-        suffix: "_readsQC.info"
         data_object_type: Read Filtering Info File
         description: "Read filtering info for {id}"
 
@@ -146,32 +140,26 @@ Workflows:
     Outputs:
       - output: contig
         name: Final assembly contigs fasta
-        suffix: "_contigs.fna"
         data_object_type: Assembly Contigs
         description: "Assembly contigs for {id}"
       - output: scaffold
         name: Final assembly scaffolds fasta
-        suffix: "_scaffolds.fna"
         data_object_type: Assembly Scaffolds
         description: "Assembly scaffolds for {id}"
       - output: covstats
         name: Assembled contigs coverage information
-        suffix: "_covstats.txt"
         data_object_type: Assembly Coverage Stats
         description: "Coverage Stats for {id}"
       - output: agp
         name: An AGP format file that describes the assembly
-        suffix: "_assembly.agp"
         data_object_type: Assembly AGP
         description: "AGP for {id}"
       - output: bam
         name: Sorted bam file of reads mapping back to the final assembly
-        suffix: "_pairedMapped_sorted.bam"
         data_object_type: Assembly Coverage BAM
         description: "Sorted Bam for {id}"
       - output: asminfo
         name: File containing assembly info
-        suffix: "_metaAsm.info"
         data_object_type: Assembly Info File
         description: "Assembly info for {id}"
 
@@ -197,22 +185,18 @@ Workflows:
         data_object_type: Annotation Amino Acid FASTA
         description: FASTA Amino Acid File for {id}
         name: FASTA amino acid file for annotated proteins
-        suffix: _proteins.faa
       - output: structural_gff
         data_object_type: Structural Annotation GFF
         description: Structural Annotation for {id}
         name: GFF3 format file with structural annotations
-        suffix: _structural_annotation.gff
       - output: functional_gff
         data_object_type: Functional Annotation GFF
         description: Functional Annotation for {id}
         name: GFF3 format file with functional annotations
-        suffix: _functional_annotation.gff
       - output: ko_tsv
         data_object_type: Annotation KEGG Orthology
         description: KEGG Orthology for {id}
         name: Tab delimited file for KO annotation
-        suffix: _ko.tsv
       - output: ec_tsv
         data_object_type: Annotation Enzyme Commission
         description: EC Annotations for {id}
@@ -227,92 +211,74 @@ Workflows:
         data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
         description: COGs for {id}
         name: GFF3 format file with COGs
-        suffix: _cog.gff
       - output: pfam_gff
         data_object_type: Pfam Annotation GFF
         description: Pfam Annotation for {id}
         name: GFF3 format file with Pfam
-        suffix: _pfam.gff
       - output: tigrfam_gff
         data_object_type: TIGRFam Annotation GFF
         description: TIGRFam for {id}
         name: GFF3 format file with TIGRfam
-        suffix: _tigrfam.gff
       - output: smart_gff
         data_object_type: SMART Annotation GFF
         description: SMART Annotations for {id}
         name: GFF3 format file with SMART
-        suffix: _smart.gff
       - output: supfam_gff
         data_object_type: SUPERFam Annotation GFF
         description: SUPERFam Annotations for {id}
         name: GFF3 format file with SUPERFam
-        suffix: _supfam.gff
       - output: cath_funfam_gff
         data_object_type: CATH FunFams (Functional Families) Annotation GFF
         description: CATH FunFams for {id}
         name: GFF3 format file with CATH FunFams
-        suffix: _cath_funfam.gff
       - output: crt_gff
         data_object_type: CRT Annotation GFF
         description: CRT Annotations for {id}
         name: GFF3 format file with CRT
-        suffix: _crt.gff
       - output: genemark_gff
         data_object_type: Genmark Annotation GFF
         description: Genemark Annotations for {id}
         name: GFF3 format file with Genemark
-        suffix: _genemark.gff
       - output: prodigal_gff
         data_object_type: Prodigal Annotation GFF
         description: Prodigal Annotations {id}
         name: GFF3 format file with Prodigal
-        suffix: _prodigal.gff
       - output: trna_gff
         data_object_type: TRNA Annotation GFF
         description: TRNA Annotations {id}
         name: GFF3 format file with TRNA
-        suffix: _trna.gff
       - output: final_rfam_gff
         data_object_type: RFAM Annotation GFF
         description: RFAM Annotations for {id}
         name: GFF3 format file with RFAM
-        suffix: _rfam.gff
       - output: ko_ec_gff
         data_object_type: KO_EC Annotation GFF
         description: KO_EC Annotations for {id}
         name: GFF3 format file with KO_EC
-        suffix: _ko_ec.gff
       - output: product_names_tsv
         data_object_type: Product Names
         description: Product names for {id}
         name: Product names file
-        suffix: _product_names.tsv
       - output: gene_phylogeny_tsv
         data_object_type: Gene Phylogeny tsv
         description: Gene Phylogeny for {id}
         name: Gene Phylogeny file
-        suffix: _gene_phylogeny.tsv
       - output: crt_crisprs
         data_object_type: Crisprt Terms
         description: Crispr Terms for {id}
         name: Crispr Terms
-        suffix: _crt.crisprs
       - output: stats_tsv
         data_object_type: Annotation Statistics
         description: Annotation Stats for {id}
         name: Annotation statistics report
-        suffix: _stats.tsv
       - output: contig_mapping
         data_object_type: Contig Mapping File
         description: Conging mappings file for {id}
         name: Contig mappings between contigs and scaffolds
-        suffix: _contig_names_mapping.tsv
       - output: imgap_version
         data_object_type: Annotation Info File
         description: Annotation info for {id}
         name: File containing annotation info
-        suffix: _imgap.info
 
   - Name: MAGs
     Type: nmdc:MAGsAnalysisActivity
@@ -350,17 +316,14 @@ Workflows:
       data_object_type: CheckM Statistics
       description: CheckM for {id}
       name: CheckM statistics report
-      suffix: _checkm_qa.out
     - output: final_hqmq_bins_zip
       data_object_type: Metagenome Bins
       description: Metagenome Bins for {id}
       name: Metagenome bin tarfiles archive
-      suffix: _hqmq_bin.zip
     - output: final_gtdbtk_bac_summary
       data_object_type: GTDBTK Bacterial Summary
       description: Bacterial Summary for {id}
       name: GTDBTK bacterial summary
-      suffix: _gtdbtk.bac122.summary.tsv
     - output: final_gtdbtk_ar_summary
       data_object_type: GTDBTK Archaeal Summary
       description: Archaeal Summary for {id}
@@ -370,7 +333,6 @@ Workflows:
       data_object_tye: Metagenome Bins Info File
       description: Metagenome Bins Info File for {id}
       name: Metagenome Bins Info File
-      suffix: _bin.info
 
   - Name: Readbased Analysis
     Type: nmdc:ReadBasedTaxonomyAnalysisActivity
@@ -393,47 +355,38 @@ Workflows:
       data_object_type: GOTTCHA2 Classification Report
       description: GOTTCHA2 Classification for {id}
       name: GOTTCHA2 classification report file
-      suffix: _gottcha2_report.tsv
     - output: final_gottcha2_full_tsv
       data_object_type: GOTTCHA2 Report Full
       description: GOTTCHA2 Full Report for {id}
       name: GOTTCHA2 report file
-      suffix: _gottcha2_full_tsv
     - output: final_gottcha2_krona_html
       data_object_type: GOTTCHA2 Krona Plot
       description: GOTTCHA2 Krona for {id}
       name: GOTTCHA2 krona plot HTML file
-      suffix: _gottcha2_krona.html
     - output: final_centrifuge_classification_tsv
       data_object_type: Centrifuge Taxonomic Classification
       description: Centrifuge Classification for {id}
       name: Centrifuge output read classification file
-      suffix: _centrifuge_classification.tsv
     - output: final_centrifuge_report_tsv
       data_object_type: Centrifuge output report file
       description: Centrifuge Report for {id}
       name: Centrifuge Classification Report
-      suffix: _centrifuge_report.tsv
     - output: final_centrifuge_krona_html
       data_object_type: Centrifuge Krona Plot
       description: Centrifuge Krona for {id}
       name: Centrifug krona plot HTML file
-      suffix: _centrifuge_krona.html
     - output: final_kraken2_classification_tsv
       data_object_type: Kraken2 Taxonomic Classification
       description: Kraken2 Classification for {id}
       name: Kraken2 output read classification file
-      suffix: _kraken2_classification.tsv
     - output: final_kraken2_report_tsv
       data_object_type: Kraken2 Classification Report
       description: Kraken2 Report for {id}
       name: Kraken2 output report file
-      suffix: _kraken2_report.tsv
     - output: final_kraken2_krona_html
       data_object_type: Kraken2 Krona Plot
       description: Kraken2 Krona for {id}
       name: Kraken2 Krona plot HTML file
-      suffix: _kraken2_krona.html
     - output: info_file
       data_object_type: Read Based Analysis Info File
       description: Read based analysis info for {id}

From d6684f2a9f43742e9ba3a2a00ea22d950c7ab2fa Mon Sep 17 00:00:00 2001
From: Shane Canon <scanon@lbl.gov>
Date: Thu, 26 Oct 2023 17:22:46 -0700
Subject: [PATCH 3/4] Sync up with other version

---
 configs/workflows2.yaml | 60 +++++------------------------------------
 1 file changed, 7 insertions(+), 53 deletions(-)

diff --git a/configs/workflows2.yaml b/configs/workflows2.yaml
index 1003b52b..eb1204d5 100644
--- a/configs/workflows2.yaml
+++ b/configs/workflows2.yaml
@@ -43,12 +43,10 @@ Workflows:
     Outputs:
       - output: filtered_final
         name: Reads QC result fastq (clean data)
-        suffix: "_filtered.fastq.gz"
         data_object_type: Filtered Sequencing Reads
         description: "Reads QC for {id}"
       - output: filtered_stats_final
         name: Reads QC summary statistics
-        suffix: "_filterStats.txt"
         data_object_type: QC Statistics
         description: "Reads QC summary for {id}"
 
@@ -78,12 +76,10 @@ Workflows:
     Outputs:
       - output: filtered_final
         name: Reads QC result fastq (clean data)
-        suffix: "_filtered.fastq.gz"
         data_object_type: Filtered Sequencing Reads
         description: "Reads QC for {id}"
       - output: filtered_stats_final
         name: Reads QC summary statistics
-        suffix: "_filterStats.txt"
         data_object_type: QC Statistics
         description: "Reads QC summary for {id}"
 
@@ -98,10 +94,10 @@ Workflows:
     Inputs:
       proj: "{activity_id}"
       input_file_1: do:Metagenome Raw Read 1
-      input_file_1: do:Metagenome Raw Read 2
+      input_file_2: do:Metagenome Raw Read 2
     Filter Input Objects:
-    - Metagenome Raw Reads 1
-    - Metagenome Raw Reads 2
+    - Metagenome Raw Read 1
+    - Metagenome Raw Read 2
     Predecessors:
     - Sequencing Noninterleaved
     Input_prefix: nmdc_rqcfilter
@@ -115,17 +111,14 @@ Workflows:
     Outputs:
       - output: filtered_final
         name: Reads QC result fastq (clean data)
-        suffix: "_filtered.fastq.gz"
         data_object_type: Filtered Sequencing Reads
         description: "Reads QC for {id}"
       - output: filtered_stats_final
         name: Reads QC summary statistics
-        suffix: "_filterStats.txt"
         data_object_type: QC Statistics
         description: "Reads QC summary for {id}"
       - output: rqc_info
         name: File containing read filtering information
-        suffix: "_readsQC.info"
         data_object_type: Read Filtering Info File
         description: "Read filtering info for {id}"
 
@@ -133,7 +126,7 @@ Workflows:
     Type: nmdc:MetagenomeAssembly
     Enabled: True
     Git_repo: https://github.com/microbiomedata/metaAssembly
-    Version: v1.0.4-beta
+    Version: v1.0.3
     WDL: jgi_assembly.wdl
     Collection: metagenome_assembly_set
     Predecessors:
@@ -175,27 +168,22 @@ Workflows:
     Outputs:
       - output: contig
         name: Final assembly contigs fasta
-        suffix: "_contigs.fna"
         data_object_type: Assembly Contigs
         description: "Assembly contigs for {id}"
       - output: scaffold
         name: Final assembly scaffolds fasta
-        suffix: "_scaffolds.fna"
         data_object_type: Assembly Scaffolds
         description: "Assembly scaffolds for {id}"
       - output: covstats
         name: Assembled contigs coverage information
-        suffix: "_covstats.txt"
         data_object_type: Assembly Coverage Stats
         description: "Coverage Stats for {id}"
       - output: agp
         name: An AGP format file that describes the assembly
-        suffix: "_assembly.agp"
         data_object_type: Assembly AGP
         description: "AGP for {id}"
       - output: bam
         name: Sorted bam file of reads mapping back to the final assembly
-        suffix: "_pairedMapped_sorted.bam"
         data_object_type: Assembly Coverage BAM
         description: "Sorted Bam for {id}"
 
@@ -203,7 +191,7 @@ Workflows:
     Type: nmdc:MetagenomeAnnotationActivity
     Enabled: True
     Git_repo: https://github.com/microbiomedata/mg_annotation
-    Version: v1.0.2-beta
+    Version: v1.0.4
     WDL: annotation_full.wdl
     Collection: metagenome_annotation_activity_set
     Predecessors:
@@ -221,114 +209,93 @@ Workflows:
         data_object_type: Annotation Amino Acid FASTA
         description: FASTA Amino Acid File for {id}
         name: FASTA amino acid file for annotated proteins
-        suffix: _proteins.faa
       - output: structural_gff
         data_object_type: Structural Annotation GFF
         description: Structural Annotation for {id}
         name: GFF3 format file with structural annotations
-        suffix: _structural_annotation.gff
       - output: functional_gff
         data_object_type: Functional Annotation GFF
         description: Functional Annotation for {id}
         name: GFF3 format file with functional annotations
-        suffix: _functional_annotation.gff
       - output: ko_tsv
         data_object_type: Annotation KEGG Orthology
         description: KEGG Orthology for {id}
         name: Tab delimited file for KO annotation
-        suffix: _ko.tsv
       - output: ec_tsv
         data_object_type: Annotation Enzyme Commission
         description: EC Annotations for {id}
         name: Tab delimited file for EC annotation
-        suffix: _ec.tsv
       - output: cog_gff
         data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
         description: COGs for {id}
         name: GFF3 format file with COGs
-        suffix: _cog.gff
       - output: pfam_gff
         data_object_type: Pfam Annotation GFF
         description: Pfam Annotation for {id}
         name: GFF3 format file with Pfam
-        suffix: _pfam.gff
       - output: tigrfam_gff
         data_object_type: TIGRFam Annotation GFF
         description: TIGRFam for {id}
         name: GFF3 format file with TIGRfam
-        suffix: _tigrfam.gff
       - output: smart_gff
         data_object_type: SMART Annotation GFF
         description: SMART Annotations for {id}
         name: GFF3 format file with SMART
-        suffix: _smart.gff
       - output: supfam_gff
         data_object_type: SUPERFam Annotation GFF
         description: SUPERFam Annotations for {id}
         name: GFF3 format file with SUPERFam
-        suffix: _supfam.gff
       - output: cath_funfam_gff
         data_object_type: CATH FunFams (Functional Families) Annotation GFF
         description: CATH FunFams for {id}
         name: GFF3 format file with CATH FunFams
-        suffix: _cath_funfam.gff
       - output: crt_gff
         data_object_type: CRT Annotation GFF
         description: CRT Annotations for {id}
         name: GFF3 format file with CRT
-        suffix: _crt.gff
       - output: genemark_gff
         data_object_type: Genmark Annotation GFF
         description: Genemark Annotations for {id}
         name: GFF3 format file with Genemark
-        suffix: _genemark.gff
       - output: prodigal_gff
         data_object_type: Prodigal Annotation GFF
         description: Prodigal Annotations {id}
         name: GFF3 format file with Prodigal
-        suffix: _prodigal.gff
       - output: trna_gff
         data_object_type: TRNA Annotation GFF
         description: TRNA Annotations {id}
         name: GFF3 format file with TRNA
-        suffix: _trna.gff
       - output: final_rfam_gff
         data_object_type: RFAM Annotation GFF
         description: RFAM Annotations for {id}
         name: GFF3 format file with RFAM
-        suffix: _rfam.gff
       - output: ko_ec_gff
         data_object_type: KO_EC Annotation GFF
         description: KO_EC Annotations for {id}
         name: GFF3 format file with KO_EC
-        suffix: _ko_ec.gff
       - output: product_names_tsv
         data_object_type: Product Names
         description: Product names for {id}
         name: Product names file
-        suffix: _product_names.tsv
       - output: gene_phylogeny_tsv
         data_object_type: Gene Phylogeny tsv
         description: Gene Phylogeny for {id}
         name: Gene Phylogeny file
-        suffix: _gene_phylogeny.tsv
       - output: crt_crisprs
         data_object_type: Crisprt Terms
         description: Crispr Terms for {id}
         name: Crispr Terms
-        suffix: _crt.crisprs
       - output: stats_tsv
         data_object_type: Annotation Statistics
         description: Annotation Stats for {id}
         name: Annotation statistics report
-        suffix: _stats.tsv
 
   - Name: MAGs
     Type: nmdc:MAGsAnalysisActivity
     Enabled: True
     Git_repo: https://github.com/microbiomedata/mg_annotation
     Git_repo: https://github.com/microbiomedata/metaMAGs
-    Version: v1.0.5-beta
+    Version: v1.0.6
     WDL: mbin_nmdc.wdl
     Collection: mags_activity_set
     Predecessors:
@@ -359,28 +326,24 @@ Workflows:
       data_object_type: CheckM Statistics
       description: CheckM for {id}
       name: CheckM statistics report
-      suffix: _checkm_qa.out
     - output: final_hqmq_bins_zip
       data_object_type: Metagenome Bins
       description: Metagenome Bins for {id}
       name: Metagenome bin tarfiles archive
-      suffix: _hqmq_bin.zip
     - output: final_gtdbtk_bac_summary
       data_object_type: GTDBTK Bacterial Summary
       description: Bacterial Summary for {id}
       name: GTDBTK bacterial summary
-      suffix: _gtdbtk.bac122.summary.tsv
     - output: final_gtdbtk_ar_summary
       data_object_type: GTDBTK Archaeal Summary
       description: Archaeal Summary for {id}
       name: GTDBTK archaeal summary
-      suffix: _gtdbtk.ar122.summary.tsv
 
   - Name: Readbased Analysis
     Type: nmdc:ReadBasedTaxonomyAnalysisActivity
     Enabled: True
     Git_repo: https://github.com/microbiomedata/ReadbasedAnalysis
-    Version: v1.0.5-beta
+    Version: v1.0.5
     WDL: ReadbasedAnalysis.wdl
     Collection: read_based_taxonomy_analysis_activity_set
     Predecessors:
@@ -397,45 +360,36 @@ Workflows:
       data_object_type: GOTTCHA2 Classification Report
       description: GOTTCHA2 Classification for {id}
       name: GOTTCHA2 classification report file
-      suffix: _gottcha2_report.tsv
     - output: final_gottcha2_full_tsv
       data_object_type: GOTTCHA2 Report Full
       description: GOTTCHA2 Full Report for {id}
       name: GOTTCHA2 report file
-      suffix: _gottcha2_full_tsv
     - output: final_gottcha2_krona_html
       data_object_type: GOTTCHA2 Krona Plot
       description: GOTTCHA2 Krona for {id}
       name: GOTTCHA2 krona plot HTML file
-      suffix: _gottcha2_krona.html
     - output: final_centrifuge_classification_tsv
       data_object_type: Centrifuge Taxonomic Classification
       description: Centrifuge Classification for {id}
       name: Centrifuge output read classification file
-      suffix: _centrifuge_classification.tsv
     - output: final_centrifuge_report_tsv
       data_object_type: Centrifuge output report file
       description: Centrifuge Report for {id}
       name: Centrifuge Classification Report
-      suffix: _centrifuge_report.tsv
     - output: final_centrifuge_krona_html
       data_object_type: Centrifuge Krona Plot
       description: Centrifuge Krona for {id}
       name: Centrifug krona plot HTML file
-      suffix: _centrifuge_krona.html
     - output: final_kraken2_classification_tsv
       data_object_type: Kraken2 Taxonomic Classification
       description: Kraken2 Classification for {id}
       name: Kraken2 output read classification file
-      suffix: _kraken2_classification.tsv
     - output: final_kraken2_report_tsv
       data_object_type: Kraken2 Classification Report
       description: Kraken2 Report for {id}
       name: Kraken2 output report file
-      suffix: _kraken2_report.tsv
     - output: final_kraken2_krona_html
       data_object_type: Kraken2 Krona Plot
       description: Kraken2 Krona for {id}
       name: Kraken2 Krona plot HTML file
-      suffix: _kraken2_krona.html
 

From a736b084c41b5b1c0d364ef9a24e73ff9d69c286 Mon Sep 17 00:00:00 2001
From: Shane Canon <scanon@lbl.gov>
Date: Thu, 26 Oct 2023 17:23:14 -0700
Subject: [PATCH 4/4] Add debug logging of workflow name and version

---
 nmdc_automation/workflow_automation/activities.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nmdc_automation/workflow_automation/activities.py b/nmdc_automation/workflow_automation/activities.py
index 0e6ab546..f1580f46 100644
--- a/nmdc_automation/workflow_automation/activities.py
+++ b/nmdc_automation/workflow_automation/activities.py
@@ -53,6 +53,7 @@ def _read_acitivites(db, workflows: List[Workflow],
     """
     activities = []
     for wf in workflows:
+        logging.debug(f"Checking {wf.name}:{wf.version}")
         q = filter
         q['git_url'] = wf.git_repo
         q['version'] = wf.version