Merge pull request #25 from icgc-argo/[email protected].…

…1-1.3.1 [release]
icgc-argo · Feb 15, 2021 · 80029df · 80029df
2 parents 30f0d3f + 9343071
commit 80029df
Show file tree

Hide file tree

Showing 67 changed files with 22,611 additions and 3 deletions.
diff --git a/demo-dna-seq-processing-wf/main.nf b/demo-dna-seq-processing-wf/main.nf
@@ -24,7 +24,7 @@
 
 nextflow.enable.dsl = 2
 name = 'dna-seq-processing'
-version = '1.7.0-1.3.0'
+version = '1.7.1-1.3.1'
 
 
 params.ref_genome_fa = ""

diff --git a/demo-dna-seq-processing-wf/pkg.json b/demo-dna-seq-processing-wf/pkg.json
@@ -1,6 +1,6 @@
 {
     "name": "demo-dna-seq-alignment-wf",
-    "version": "1.7.0-1.3.0",
+    "version": "1.7.1-1.3.1",
     "description": "DNA Seq Alignment Workflow with QC Metrics Collection",
     "main": "main.nf",
     "scripts": {
@@ -21,7 +21,7 @@
     },
     "dependencies": [
         "github.com/icgc-argo/demo-wfpkgs/[email protected]",
-        "github.com/icgc-argo/demo-wfpkgs/demo-dna-seq-alignment-wf@1.7.0",
+        "github.com/icgc-argo/demo-wfpkgs/demo-dna-seq-alignment-wf@1.7.1",
         "github.com/icgc-argo/demo-wfpkgs/[email protected]"
     ],
     "devDependencies": [],

diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/.dockerignore b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/.dockerignore
@@ -0,0 +1,5 @@
+.nextflow*
+.gitignore
+work
+outdir
+tests
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/Dockerfile b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/Dockerfile
@@ -0,0 +1,11 @@
+# Base image for the QC tool.
+# NOTE(review): presumably provides samtools and python3, which aligned-seq-qc.py relies on -- confirm.
+FROM quay.io/icgc-argo/dna-seq-processing-tools:base-docker.0.1.1
+
+# Link the published image back to its source repository (key="value" form
+# instead of the legacy whitespace-separated form).
+LABEL org.opencontainers.image.source="https://github.com/icgc-argo/demo-wfpkgs"
+
+# Put /tools first on PATH so the scripts copied below can be called by name.
+ENV PATH="/tools:${PATH}"
+
+WORKDIR /tools
+
+# Only the Python scripts from the build context are shipped in the image.
+COPY *.py /tools/
+
+CMD ["/bin/bash"]
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/aligned-seq-qc.nf b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/aligned-seq-qc.nf
@@ -0,0 +1,73 @@
+#!/usr/bin/env nextflow
+
+/*
+ * Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Contributors
+ *   Junjun Zhang <[email protected]>
+ *   Linda Xiang <[email protected]>
+ */
+
+/********************************************************************/
+/* this block is auto-generated based on info from pkg.json where   */
+/* changes can be made if needed, do NOT modify this block manually */
+nextflow.enable.dsl = 2
+name = 'demo-aligned-seq-qc'
+version = '1.1.0'
+container = [
+    'ghcr.io': 'ghcr.io/icgc-argo/demo-wfpkgs.demo-aligned-seq-qc',
+    'quay.io': 'quay.io/icgc-argo/demo-wfpkgs.demo-aligned-seq-qc'
+]
+default_container_registry = 'ghcr.io'
+/********************************************************************/
+
+// Tool inputs: the aligned sequence file and the reference genome passed to aligned-seq-qc.py.
+params.seq = ""
+// When empty, the process falls back to this module's `version` for the image tag.
+params.container_version = ""
+params.ref_genome_gz = ""
+
+// Key into the `container` map above that selects the image registry.
+// NOTE(review): pkg.json of this package flags quay.io as the default registry,
+// while default_container_registry above is 'ghcr.io' -- confirm which is intended.
+params.container_registry = default_container_registry
+// When empty, publishing of the process outputs is disabled.
+params.publish_dir = ""
+// Resources requested for the process; cpus is also passed to the tool via -n.
+params.cpus = 1
+params.mem = 2  // in GB
+
+
+// Runs aligned-seq-qc.py on one aligned sequence file and emits the resulting
+// *.qc_metrics.tgz archive on the `metrics` output.
+process alignedSeqQC {
+  // Image comes from the registry selected by params.container_registry; the tag
+  // is params.container_version, or this module's `version` when that is empty.
+  container "${container[params.container_registry]}:${params.container_version ?: version}"
+  // Outputs are copied to <publish_dir>/<process name with ':' replaced by '_'>.
+  // `enabled` evaluates to the string "true" when params.publish_dir is set and
+  // to an empty string otherwise.
+  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}",
+    mode: "copy",
+    enabled: "${params.publish_dir ? true : ''}"
+
+  cpus params.cpus
+  memory "${params.mem} GB"
+
+  input:
+    path seq
+    path ref_genome_gz
+    // Declared as an input but not referenced by the command below.
+    path ref_genome_gz_secondary_file
+    // Not referenced by the command below.
+    // NOTE(review): presumably lets callers order this process after upstream steps -- confirm.
+    val dependencies
+
+  output:
+    path "*.qc_metrics.tgz", emit: metrics
+
+  script:
+    """
+    aligned-seq-qc.py -s ${seq} \
+                      -r ${ref_genome_gz} \
+                      -n ${params.cpus}
+    """
+}
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/aligned-seq-qc.py b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/aligned-seq-qc.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+"""
+ Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published
+ by the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+ Contributors:
+   Junjun Zhang <[email protected]>
+   Linda Xiang <[email protected]>
+"""
+
+import os
+import sys
+from argparse import ArgumentParser
+import subprocess
+from multiprocessing import cpu_count
+import tarfile
+import glob
+import json
+
+
+def collect_metrics(args):
+  # generate stats_args string
+  stats_args = [
+    '--reference', args.reference,
+    '-@', str(args.cpus),
+    '-r', args.reference,
+    '--split', 'RG',
+    '-P', os.path.join(os.getcwd(), os.path.basename(args.seq))
+  ]
+
+  try:
+    cmd = ['samtools', 'stats'] + stats_args + [args.seq]
+    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
+  except Exception as e:
+    sys.exit("Error: %s. 'samtools stats' failed: %s\n" % (e, args.seq))
+
+  with open(os.path.join(os.getcwd(), os.path.basename(args.seq)+".bamstat"), 'w') as f:
+    f.write(p.stdout.decode('utf-8'))
+
+  extra_info = {}
+  collected_sum_fields = {
+    'raw total sequences': 'total_reads',
+    'reads mapped': 'mapped_reads',
+    'reads paired': 'paired_reads',
+    'reads properly paired': 'properly_paired_reads',
+    'pairs on different chromosomes': 'pairs_on_different_chromosomes',
+    'total length': 'total_bases',
+    'bases mapped (cigar)': 'mapped_bases_cigar',
+    'mismatches': 'mismatch_bases',
+    'error rate': 'error_rate',
+    'bases duplicated': 'duplicated_bases',
+    'insert size average': 'average_insert_size',
+    'average length': 'average_length'
+  }
+  with open(os.path.join(os.getcwd(), os.path.basename(args.seq)+".bamstat"), 'r') as f:
+    for row in f:
+      if not row.startswith('SN\t'): continue
+      cols = row.replace(':', '').strip().split('\t')
+
+      if cols[1] not in collected_sum_fields: continue
+      extra_info.update({
+          collected_sum_fields[cols[1]]: float(cols[2]) if ('.' in cols[2] or 'e' in cols[2]) else int(cols[2])
+        })
+
+  p = subprocess.run('samtools --version | grep samtools', stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True, shell=True)
+  tool_ver = "samtools:stats@%s" % p.stdout.decode('utf-8').strip().split(' ')[-1]
+  extra_info.update({ "tool": tool_ver })
+  with open(os.path.basename(args.seq) + '.extra_info.json', "w") as j:
+    j.write(json.dumps(extra_info, indent=2))
+
+  # make tar gzip ball of the *.bamstat files
+  tarfile_name = os.path.basename(args.seq)+'.qc_metrics.tgz'
+  with tarfile.open(tarfile_name, "w:gz") as tar:
+    for statsfile in glob.glob(os.path.join(os.getcwd(), "*.bamstat")) + glob.glob(os.path.join(os.getcwd(), "*.extra_info.json")):
+      tar.add(statsfile, arcname=os.path.basename(statsfile))
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument("-s", "--seq", dest="seq", help="Aligned sequence file", type=str, required=True)
+  parser.add_argument('-r', '--reference', dest='reference', type=str, help='reference fasta', required=True)
+  parser.add_argument('-n', '--cpus', dest='cpus', type=int, help='number of cpu cores', default=cpu_count())
+
+  args = parser.parse_args()
+
+  collect_metrics(args)
+
+
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == '__main__':
+  main()
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/nextflow.config b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/nextflow.config
@@ -0,0 +1,4 @@
+// Docker execution settings for this package.
+docker {
+    enabled = true
+    // Passes -u <uid>:<gid> to `docker run`; the escaped \$(...) keeps the
+    // command substitutions literal in this config string.
+    // NOTE(review): presumably so files written by the container are owned by
+    // the invoking host user -- confirm.
+    runOptions = '-u \$(id -u):\$(id -g)'
+}
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/pkg.json b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/pkg.json
@@ -0,0 +1,53 @@
+{
+    "name": "demo-aligned-seq-qc",
+    "version": "1.1.0",
+    "description": "Collect QC metrics from aligned sequencing reads",
+    "main": "aligned-seq-qc.nf",
+    "scripts": {
+        "test": "pushd tests && ./run_tests.sh; popd"
+    },
+    "deprecated": false,
+    "keywords": [
+        "bioinformatics",
+        "alignment",
+        "qc",
+        "metrics"
+    ],
+    "repository": {
+        "type": "git",
+        "url": "https://github.com/icgc-argo/demo-wfpkgs.git"
+    },
+    "container": {
+        "registries": [
+            {
+                "registry": "quay.io",
+                "type": "docker",
+                "org": "icgc-argo",
+                "default": true
+            },
+            {
+                "registry": "ghcr.io",
+                "type": "docker",
+                "org": "icgc-argo",
+                "default": false
+            }
+        ]
+    },
+    "dependencies": [],
+    "devDependencies": [
+        "github.com/icgc-argo/demo-wfpkgs/[email protected]"
+    ],
+    "contributors": [
+        {
+            "name": "Junjun Zhang",
+            "email": "[email protected]"
+        },
+        {
+            "name": "Linda Xiang",
+            "email": "[email protected]"
+        }
+    ],
+    "license": "AGPL-3.0",
+    "bugReport": "https://github.com/icgc-argo/demo-wfpkgs/issues",
+    "homepage": "https://github.com/icgc-argo/demo-wfpkgs#readme"
+}
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/tests/checker.nf b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-aligned-seq-qc@1.1.0/tests/checker.nf
@@ -0,0 +1,55 @@
+#!/usr/bin/env nextflow
+
+/*
+ * Copyright (c) 2019-2020, Ontario Institute for Cancer Research (OICR).
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Contributors:
+ *   Junjun Zhang <[email protected]>
+ *   Linda Xiang <[email protected]>
+ */
+
+// Test driver for the alignedSeqQC process.
+// Uses the same DSL2 switch as the module under test (aligned-seq-qc.nf)
+// instead of the deprecated `nextflow.preview.dsl` flag.
+nextflow.enable.dsl = 2
+
+params.seq = ""
+params.container_version = ""
+params.ref_genome_gz = ""
+
+// NOTE(review): this empty value is forwarded to the module through
+// params(params), and the module indexes its registry map with
+// params.container_registry -- confirm the test jobs always supply a registry.
+params.container_registry = ""
+params.cpus = 1
+params.mem = 2  // in GB
+
+
+include { alignedSeqQC } from '../aligned-seq-qc.nf' params(params)
+include { getSecondaryFiles } from './wfpr_modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/main.nf'
+
+// Aligned sequence file(s) to run QC on; fails early if the path does not exist.
+Channel
+  .fromPath(params.seq, checkIfExists: true)
+  .set { aligned_seq }
+
+// Secondary files of the reference genome, requested with the 'fai' and 'gzi' suffixes.
+Channel
+  .fromPath(getSecondaryFiles(params.ref_genome_gz, ['fai', 'gzi']), checkIfExists: true)
+  .set { ref_genome_gz_idx }
+
+workflow {
+  alignedSeqQC(
+    aligned_seq.flatten(),
+    file(params.ref_genome_gz),
+    ref_genome_gz_idx.collect(),
+    true  // value for the process's `dependencies` input
+  )
+}
diff --git a/...rgo/demo-wfpkgs/[email protected]/tests/input/SA610149.0.20200122.wgs.grch38.cram b/...rgo/demo-wfpkgs/[email protected]/tests/input/SA610149.0.20200122.wgs.grch38.cram
diff --git a/...gc-argo/demo-wfpkgs/[email protected]/tests/input/grch38-aligned.C0HVY_2.lane.bam b/...gc-argo/demo-wfpkgs/[email protected]/tests/input/grch38-aligned.C0HVY_2.lane.bam
diff --git a/...gc-argo/demo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RE2_1.lane.bam b/...gc-argo/demo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RE2_1.lane.bam
diff --git a/...gc-argo/demo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RH0_2.lane.bam b/...gc-argo/demo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RH0_2.lane.bam
diff --git a/..._modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/tests/nextflow.config b/..._modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/tests/nextflow.config
@@ -0,0 +1 @@
+// Reuse the package-level configuration (one directory up) for test runs.
+includeConfig '../nextflow.config'