Merge pull request #22 from microbiomedata/scanon/sched_features
Updates to scheduler
mbthornton-lbl authored Jan 12, 2024
2 parents 23a0ace + 972623e commit a73554a
Showing 6 changed files with 75 additions and 123 deletions.
11 changes: 8 additions & 3 deletions Dockerfile
@@ -1,10 +1,15 @@
FROM python:3.9

ADD requirements.txt /tmp/requirements.txt
RUN \
pip install poetry && \
poetry config virtualenvs.create false

RUN pip install -r /tmp/requirements.txt
ADD pyproject.toml poetry.lock README.md /src/
WORKDIR /src
RUN \
poetry install --only=main --no-root

RUN pip install semver

ADD . /src

WORKDIR /src
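
The Dockerfile changes move dependency installation over to Poetry (poetry install --only=main --no-root, driven by pyproject.toml and poetry.lock) and add a standalone pip install semver step. A minimal sketch of the kind of semantic-version comparison the scheduler could use the semver package for; this usage, and the version values below, are illustrative assumptions, since the actual call sites are not part of this diff:

# Hypothetical example, not part of this commit: compare a workflow version
# from configs/workflows.yaml against the version recorded on an existing
# activity to decide whether it needs to be rescheduled.
import semver

workflow_version = "1.0.3"   # illustrative value, e.g. from configs/workflows.yaml
activity_version = "1.0.1"   # illustrative value, e.g. from an activity record

# semver.compare returns -1, 0, or 1, like a classic comparator
if semver.compare(activity_version, workflow_version) < 0:
    print("activity predates the current workflow version")
else:
    print("activity is up to date")
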
55 changes: 4 additions & 51 deletions configs/workflows.yaml
@@ -44,17 +44,14 @@ Workflows:
Outputs:
- output: filtered_final
name: Reads QC result fastq (clean data)
suffix: "_filtered.fastq.gz"
data_object_type: Filtered Sequencing Reads
description: "Reads QC for {id}"
- output: filtered_stats_final
name: Reads QC summary statistics
suffix: "_filterStats.txt"
data_object_type: QC Statistics
description: "Reads QC summary for {id}"
- output: rqc_info
name: File containing read filtering information
suffix: "_readsQC.info"
data_object_type: Read Filtering Info File
description: "Read filtering info for {id}"

@@ -85,17 +82,14 @@ Workflows:
Outputs:
- output: filtered_final
name: Reads QC result fastq (clean data)
suffix: "_filtered.fastq.gz"
data_object_type: Filtered Sequencing Reads
description: "Reads QC for {id}"
- output: filtered_stats_final
name: Reads QC summary statistics
suffix: "_filterStats.txt"
data_object_type: QC Statistics
description: "Reads QC summary for {id}"
- output: rqc_info
name: File containing read filtering information
suffix: "_readsQC.info"
data_object_type: Read Filtering Info File
description: "Read filtering info for {id}"

@@ -145,32 +139,26 @@ Workflows:
Outputs:
- output: contig
name: Final assembly contigs fasta
suffix: "_contigs.fna"
data_object_type: Assembly Contigs
description: "Assembly contigs for {id}"
- output: scaffold
name: Final assembly scaffolds fasta
suffix: "_scaffolds.fna"
data_object_type: Assembly Scaffolds
description: "Assembly scaffolds for {id}"
- output: covstats
name: Assembled contigs coverage information
suffix: "_covstats.txt"
data_object_type: Assembly Coverage Stats
description: "Coverage Stats for {id}"
- output: agp
name: An AGP format file that describes the assembly
suffix: "_assembly.agp"
data_object_type: Assembly AGP
description: "AGP for {id}"
- output: bam
name: Sorted bam file of reads mapping back to the final assembly
suffix: "_pairedMapped_sorted.bam"
data_object_type: Assembly Coverage BAM
description: "Sorted Bam for {id}"
- output: asminfo
name: File containing assembly info
suffix: "_metaAsm.info"
data_object_type: Assembly Info File
description: "Assembly info for {id}"

@@ -196,22 +184,18 @@ Workflows:
data_object_type: Annotation Amino Acid FASTA
description: FASTA Amino Acid File for {id}
name: FASTA amino acid file for annotated proteins
suffix: _proteins.faa
- output: structural_gff
data_object_type: Structural Annotation GFF
description: Structural Annotation for {id}
name: GFF3 format file with structural annotations
suffix: _structural_annotation.gff
- output: functional_gff
data_object_type: Functional Annotation GFF
description: Functional Annotation for {id}
name: GFF3 format file with functional annotations
suffix: _functional_annotation.gff
- output: ko_tsv
data_object_type: Annotation KEGG Orthology
description: KEGG Orthology for {id}
name: Tab delimited file for KO annotation
suffix: _ko.tsv
- output: ec_tsv
data_object_type: Annotation Enzyme Commission
description: EC Annotations for {id}
@@ -226,92 +210,74 @@ Workflows:
data_object_type: Clusters of Orthologous Groups (COG) Annotation GFF
description: COGs for {id}
name: GFF3 format file with COGs
suffix: _cog.gff
- output: pfam_gff
data_object_type: Pfam Annotation GFF
description: Pfam Annotation for {id}
name: GFF3 format file with Pfam
suffix: _pfam.gff
- output: tigrfam_gff
data_object_type: TIGRFam Annotation GFF
description: TIGRFam for {id}
name: GFF3 format file with TIGRfam
suffix: _tigrfam.gff
- output: smart_gff
data_object_type: SMART Annotation GFF
description: SMART Annotations for {id}
name: GFF3 format file with SMART
suffix: _smart.gff
- output: supfam_gff
data_object_type: SUPERFam Annotation GFF
description: SUPERFam Annotations for {id}
name: GFF3 format file with SUPERFam
suffix: _supfam.gff
- output: cath_funfam_gff
data_object_type: CATH FunFams (Functional Families) Annotation GFF
description: CATH FunFams for {id}
name: GFF3 format file with CATH FunFams
suffix: _cath_funfam.gff
- output: crt_gff
data_object_type: CRT Annotation GFF
description: CRT Annotations for {id}
name: GFF3 format file with CRT
suffix: _crt.gff
- output: genemark_gff
data_object_type: Genemark Annotation GFF
description: Genemark Annotations for {id}
name: GFF3 format file with Genemark
suffix: _genemark.gff
- output: prodigal_gff
data_object_type: Prodigal Annotation GFF
description: Prodigal Annotations {id}
name: GFF3 format file with Prodigal
suffix: _prodigal.gff
- output: trna_gff
data_object_type: TRNA Annotation GFF
description: TRNA Annotations {id}
name: GFF3 format file with TRNA
suffix: _trna.gff
- output: final_rfam_gff
data_object_type: RFAM Annotation GFF
description: RFAM Annotations for {id}
name: GFF3 format file with RFAM
suffix: _rfam.gff
- output: ko_ec_gff
data_object_type: KO_EC Annotation GFF
description: KO_EC Annotations for {id}
name: GFF3 format file with KO_EC
suffix: _ko_ec.gff
- output: product_names_tsv
data_object_type: Product Names
description: Product names for {id}
name: Product names file
suffix: _product_names.tsv
- output: gene_phylogeny_tsv
data_object_type: Gene Phylogeny tsv
description: Gene Phylogeny for {id}
name: Gene Phylogeny file
suffix: _gene_phylogeny.tsv
- output: crt_crisprs
data_object_type: Crispr Terms
description: Crispr Terms for {id}
name: Crispr Terms
suffix: _crt.crisprs
- output: stats_tsv
data_object_type: Annotation Statistics
description: Annotation Stats for {id}
name: Annotation statistics report
suffix: _stats.tsv
# - output: contig_mapping
# data_object_type: Contig Mapping File
# description: Contig mappings file for {id}
# name: Contig mappings between contigs and scaffolds
# suffix: _contig_names_mapping.tsv
- output: contig_mapping
data_object_type: Contig Mapping File
description: Contig mappings file for {id}
name: Contig mappings between contigs and scaffolds
- output: imgap_version
data_object_type: Annotation Info File
description: Annotation info for {id}
name: File containing annotation info
suffix: _imgap.info

- Name: MAGs
Type: nmdc:MagsAnalysisActivity
@@ -350,17 +316,14 @@ Workflows:
data_object_type: CheckM Statistics
description: CheckM for {id}
name: CheckM statistics report
suffix: _checkm_qa.out
- output: final_hqmq_bins_zip
data_object_type: Metagenome Bins
description: Metagenome Bins for {id}
name: Metagenome bin tarfiles archive
suffix: _hqmq_bin.zip
- output: final_gtdbtk_bac_summary
data_object_type: GTDBTK Bacterial Summary
description: Bacterial Summary for {id}
name: GTDBTK bacterial summary
suffix: _gtdbtk.bac122.summary.tsv
- output: final_gtdbtk_ar_summary
data_object_type: GTDBTK Archaeal Summary
description: Archaeal Summary for {id}
@@ -370,7 +333,6 @@ Workflows:
data_object_type: Metagenome Bins Info File
description: Metagenome Bins Info File for {id}
name: Metagenome Bins Info File
suffix: _bin.info

- Name: Readbased Analysis
Type: nmdc:ReadBasedTaxonomyAnalysisActivity
@@ -394,47 +356,38 @@ Workflows:
data_object_type: GOTTCHA2 Classification Report
description: GOTTCHA2 Classification for {id}
name: GOTTCHA2 classification report file
suffix: _gottcha2_report.tsv
- output: final_gottcha2_full_tsv
data_object_type: GOTTCHA2 Report Full
description: GOTTCHA2 Full Report for {id}
name: GOTTCHA2 report file
suffix: _gottcha2_full_tsv
- output: final_gottcha2_krona_html
data_object_type: GOTTCHA2 Krona Plot
description: GOTTCHA2 Krona for {id}
name: GOTTCHA2 krona plot HTML file
suffix: _gottcha2_krona.html
- output: final_centrifuge_classification_tsv
data_object_type: Centrifuge Taxonomic Classification
description: Centrifuge Classification for {id}
name: Centrifuge output read classification file
suffix: _centrifuge_classification.tsv
- output: final_centrifuge_report_tsv
data_object_type: Centrifuge output report file
description: Centrifuge Report for {id}
name: Centrifuge Classification Report
suffix: _centrifuge_report.tsv
- output: final_centrifuge_krona_html
data_object_type: Centrifuge Krona Plot
description: Centrifuge Krona for {id}
name: Centrifuge krona plot HTML file
suffix: _centrifuge_krona.html
- output: final_kraken2_classification_tsv
data_object_type: Kraken2 Taxonomic Classification
description: Kraken2 Classification for {id}
name: Kraken2 output read classification file
suffix: _kraken2_classification.tsv
- output: final_kraken2_report_tsv
data_object_type: Kraken2 Classification Report
description: Kraken2 Report for {id}
name: Kraken2 output report file
suffix: _kraken2_report.tsv
- output: final_kraken2_krona_html
data_object_type: Kraken2 Krona Plot
description: Kraken2 Krona for {id}
name: Kraken2 Krona plot HTML file
suffix: _kraken2_krona.html
- output: info_file
data_object_type: Read Based Analysis Info File
description: Read based analysis info for {id}
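
Each entry under Outputs in workflows.yaml ties an output key from the workflow to the metadata used to register a data object: a human-readable name, a data_object_type, and a description template containing {id}; the per-output suffix fields are removed by this change. A minimal sketch of how such an entry might be turned into a data-object record, assuming PyYAML is available; the function name, activity id, and URL below are illustrative, not taken from the repository:

import yaml

# One Outputs entry, copied from the Reads QC section above.
entry_yaml = """
output: filtered_final
name: Reads QC result fastq (clean data)
data_object_type: Filtered Sequencing Reads
description: "Reads QC for {id}"
"""

def make_data_object_record(entry: dict, activity_id: str, url: str) -> dict:
    # Render the {id} template in the description and attach the file URL.
    # With the suffix fields removed by this change, the file name is assumed
    # to come from the workflow outputs rather than from this config.
    return {
        "name": entry["name"],
        "description": entry["description"].format(id=activity_id),
        "data_object_type": entry["data_object_type"],
        "url": url,
    }

entry = yaml.safe_load(entry_yaml)
print(make_data_object_record(entry, "nmdc:wfrqc-example",
                              "https://data.example.org/example_filtered.fastq.gz"))
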
34 changes: 21 additions & 13 deletions nmdc_automation/workflow_automation/activities.py
@@ -24,31 +24,43 @@ def _load_data_objects(db, workflows: List[Workflow]):


def _check(match_types, data_object_ids, data_objs):
if not data_object_ids:
return False
if not match_types or len(match_types) == 0:
return True
match_set = set(match_types)
do_types = set()
for doid in data_object_ids:
do_types.add(data_objs[doid].data_object_type)
if doid in data_objs:
do_types.add(data_objs[doid].data_object_type)
return match_set.issubset(do_types)


def _filter_skip(wf, rec, data_objs):
match_in = _check(wf.filter_input_objects, rec["has_input"], data_objs)
match_out = _check(wf.filter_output_objects, rec["has_output"], data_objs)
match_in = _check(wf.filter_input_objects,
rec.get("has_input"),
data_objs)
match_out = _check(wf.filter_output_objects,
rec.get("has_output"),
data_objs)
return not (match_in and match_out)


def _read_acitivites(db, workflows: List[Workflow], data_objects: dict, filter: dict):
def _read_acitivites(db, workflows: List[Workflow],
data_objects: dict, filter: dict):
"""
Read in all the activities for the defined workflows.
"""
activities = []
for wf in workflows:
logging.debug(f"Checking {wf.name}:{wf.version}")
q = filter
q["git_url"] = wf.git_repo
q["version"] = wf.version
for rec in db[wf.collection].find(q):
if wf.collection == "omics_processing_set" and \
rec["id"].startswith("gold"):
continue
if _filter_skip(wf, rec, data_objects):
continue
act = Activity(rec, wf)
@@ -86,10 +98,8 @@ def _resolve_relationships(activities, data_obj_act):
# Let's make sure these came from the same source
# This is just a safeguard
if act.was_informed_by != parent_act.was_informed_by:
logging.warning(
"Mismatched informed by found for"
f"{do_id} in {act.id} ({act.name})"
)
logging.warning("Mismatched informed by for "
f"{do_id} in {act.id}")
continue
# We only want to use it as a parent if it is the right
# parent workflow. Some inputs may come from ancestors
@@ -98,13 +108,11 @@
# This is the one
act.parent = parent_act
parent_act.children.append(act)
logging.debug(f"Found parent: {parent_act.id} {parent_act.name}")
logging.debug(f"Found parent: {parent_act.id}"
f" {parent_act.name}")
break
if len(act.workflow.parents) > 0 and not act.parent:
logging.warning(
"Didn't find a parent for "
f"{act.id} ({act.name}) - {act.workflow.name}"
)
logging.warning(f"Didn't find a parent for {act.id}")
# Now all the activities have their parent
return activities

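
In activities.py, _check now skips data object IDs that are missing from the lookup table instead of indexing them directly, and _filter_skip reads has_input and has_output with rec.get() so records lacking those keys no longer raise. A small self-contained sketch of the resulting _check behaviour; the stub class and sample IDs are hypothetical, and the helper below mirrors the updated logic rather than importing it:

from dataclasses import dataclass

@dataclass
class StubDataObject:
    # Hypothetical stand-in for the real data object wrapper; only the
    # attribute read by _check is modelled here.
    data_object_type: str

data_objs = {
    "nmdc:do-1": StubDataObject("Filtered Sequencing Reads"),
    # "nmdc:do-2" is referenced by a record but absent from the lookup table
}

def _check(match_types, data_object_ids, data_objs):
    # Mirrors the updated helper: no referenced objects fails the check,
    # an empty filter passes, and unknown IDs are skipped rather than
    # raising a KeyError.
    if not data_object_ids:
        return False
    if not match_types:
        return True
    do_types = {data_objs[doid].data_object_type
                for doid in data_object_ids if doid in data_objs}
    return set(match_types).issubset(do_types)

print(_check(["Filtered Sequencing Reads"], ["nmdc:do-1", "nmdc:do-2"], data_objs))  # True
print(_check(["QC Statistics"], ["nmdc:do-1", "nmdc:do-2"], data_objs))              # False
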
