Merge pull request #27 from icgc-argo/[email protected]

[release]
icgc-argo · Feb 16, 2021 · 077af8f · 077af8f
2 parents 496d722 + 2f24a0e
commit 077af8f
Show file tree

Hide file tree

Showing 35 changed files with 11,155 additions and 4 deletions.
diff --git a/demo-dna-seq-alignment-wf/main.nf b/demo-dna-seq-alignment-wf/main.nf
@@ -26,7 +26,7 @@
 
 nextflow.enable.dsl = 2
 name = 'demo-dna-seq-alignment-wf'
-version = '1.7.1'
+version = '1.7.2'
 
 
 params.ref_genome_fa = ""
@@ -71,7 +71,7 @@ include {
 } from './wfpr_modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/main.nf'
 
 include { bwaMemAligner as bwaMem } from './wfpr_modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/bwa-mem-aligner.nf' params(bwaMemAligner_params)
-include { bamMergeSortMarkdup as merSorMkdup } from './wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.0/bam-merge-sort-markdup.nf' params(bamMergeSortMarkdup_params)
+include { bamMergeSortMarkdup as merSorMkdup } from './wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.1/bam-merge-sort-markdup.nf' params(bamMergeSortMarkdup_params)
 
 
 workflow DnaAln {

diff --git a/demo-dna-seq-alignment-wf/pkg.json b/demo-dna-seq-alignment-wf/pkg.json
@@ -1,6 +1,6 @@
 {
     "name": "demo-dna-seq-alignment-wf",
-    "version": "1.7.1",
+    "version": "1.7.2",
     "description": "DNA Seq Alignment Workflow",
     "main": "main.nf",
     "scripts": {
@@ -20,7 +20,7 @@
     },
     "dependencies": [
         "github.com/icgc-argo/demo-wfpkgs/[email protected]",
-        "github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.0",
+        "github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.1",
         "github.com/icgc-argo/demo-wfpkgs/[email protected]"
     ],
     "devDependencies": [],

diff --git a/...modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/.dockerignore b/...modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/.dockerignore
@@ -0,0 +1,5 @@
+.nextflow*
+.gitignore
+tests
+work
+outdir
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.1/Dockerfile b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.1/Dockerfile
@@ -0,0 +1,11 @@
+FROM quay.io/icgc-argo/dna-seq-processing-tools:base-docker.0.2.0
+
+LABEL org.opencontainers.image.source https://github.com/icgc-argo/demo-wfpkgs
+
+ENV PATH="/tools:${PATH}"
+
+COPY *.py /tools/
+
+CMD ["/bin/bash"]
+
+
diff --git a/...ub.com/icgc-argo/demo-wfpkgs/[email protected]/bam-merge-sort-markdup.nf b/...ub.com/icgc-argo/demo-wfpkgs/[email protected]/bam-merge-sort-markdup.nf
@@ -0,0 +1,86 @@
+#!/usr/bin/env nextflow
+
+/*
+ * Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
+ *                                                                                                               
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Contributors:
+ *   Junjun Zhang <[email protected]>
+ *   Linda Xiang <[email protected]>
+ */
+
+/********************************************************************/
+/* this block is auto-generated based on info from pkg.json where   */
+/* changes can be made if needed, do NOT modify this block manually */
+nextflow.enable.dsl = 2
+name = 'demo-bam-merge-sort-markdup'
+version = '1.12.1'
+container = [
+    'ghcr.io': 'ghcr.io/icgc-argo/demo-wfpkgs.demo-bam-merge-sort-markdup',
+    'quay.io': 'quay.io/icgc-argo/demo-wfpkgs.demo-bam-merge-sort-markdup'
+]
+default_container_registry = 'quay.io'
+/********************************************************************/
+
+params.aligned_lane_bams = ""
+params.ref_genome_gz = ""
+params.aligned_basename = "grch38-aligned.merged"
+params.markdup = true
+params.output_format = "cram"
+params.lossy = false
+params.container_version = ""
+params.container_registry = default_container_registry
+params.cpus = 1
+params.mem = 2  // in GB
+params.publish_dir = ""
+params.tempdir = ""
+
+
+process bamMergeSortMarkdup {  // runs bam-merge-sort-markdup.py on the collected lane BAMs inside the tool container
+  container "${container[params.container_registry]}:${params.container_version ?: version}"  // image picked by registry; tag falls back to the package version
+  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}",  // ':' in the process name is replaced with '_' for the directory name
+    mode: "copy",
+    enabled: "${params.publish_dir ? true : ''}"  // evaluates to an empty string when params.publish_dir is not set
+
+  cpus params.cpus
+  memory "${params.mem} GB"
+
+
+  input:
+    path aligned_lane_bams
+    path ref_genome_gz
+    path ref_genome_gz_secondary_file  // staged into the task dir; not referenced in the script below
+    path tempdir  // a file named 'NO_DIR' means no temp directory is passed to the script
+
+  output:
+    path "${params.aligned_basename}.{bam,cram}", emit: merged_seq
+    path "${params.aligned_basename}.{bam.bai,cram.crai}", emit: merged_seq_idx
+    path "${params.aligned_basename}.duplicates_metrics.tgz", optional: true, emit: duplicates_metrics  // optional: the script only writes it when a metrics file exists
+
+  script:
+    arg_markdup = params.markdup ? "-d" : ""  // -d is the script's --mdup switch
+    arg_lossy = params.lossy ? "-l" : ""  // -l is the script's --lossy switch
+    arg_tempdir = tempdir.name != 'NO_DIR' ? "-t ${tempdir}" : ""  // omit -t so the script uses its own default
+    """
+    bam-merge-sort-markdup.py \
+      -i ${aligned_lane_bams} \
+      -r ${ref_genome_gz} \
+      -n ${params.cpus} \
+      -b ${params.aligned_basename} ${arg_markdup} \
+      -o ${params.output_format} ${arg_lossy} ${arg_tempdir}
+    """
+}
diff --git a/...ub.com/icgc-argo/demo-wfpkgs/[email protected]/bam-merge-sort-markdup.py b/...ub.com/icgc-argo/demo-wfpkgs/[email protected]/bam-merge-sort-markdup.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+"""
+ Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published
+ by the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+ Contributors:
+   Junjun Zhang <[email protected]>
+   Linda Xiang <[email protected]>
+"""
+
+
+import sys
+import subprocess
+import argparse
+from multiprocessing import cpu_count
+import json
+import os
+
+def run_cmd(cmd):
+  """Run ``cmd`` through the shell, echo its output, and exit on failure.
+
+  Args:
+    cmd: Shell command line (str) executed with ``shell=True``.
+
+  Returns:
+    Tuple ``(stdout, stderr)`` of the captured output as bytes.
+
+  Raises:
+    SystemExit: with the command's return code when it is non-zero, or
+      with 1 when the process could not be started at all.
+  """
+  # Start from bytes so the decode() calls below also work when Popen
+  # itself raises and communicate() never runs.
+  stdout, stderr, p, success = b'', b'', None, True
+  try:
+    p = subprocess.Popen([cmd],
+                          stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE,
+                          shell=True)
+    stdout, stderr = p.communicate()
+  except Exception as e:
+    print('Execution failed: %s' % e, file=sys.stderr)
+    success = False
+
+  if p is not None and p.returncode != 0:
+    print('Execution failed, none zero code returned.', file=sys.stderr)
+    success = False
+
+  # errors="replace" keeps a stray non-UTF-8 byte in tool output from
+  # masking the real result with a UnicodeDecodeError.
+  print(stdout.decode("utf-8", errors="replace"))
+  print(stderr.decode("utf-8", errors="replace"), file=sys.stderr)
+
+  if not success:
+    # p is None when the process never started; fall back to exit code 1.
+    sys.exit(p.returncode if p is not None and p.returncode else 1)
+
+  return stdout, stderr
+
+def main():
+    """Build and run the merge / mark-duplicates / format-conversion pipeline.
+
+    Parses the command line, then pipes either ``bammarkduplicates2`` (when
+    ``-d`` is given) or ``samtools merge`` into a BAM or CRAM writer chosen
+    by ``-o``. With ``-l`` and CRAM output the file is written by cramtools
+    and indexed in a separate ``samtools index`` step; ``-l`` has no effect
+    on BAM output. If a ``<base>.duplicates_metrics.txt`` file exists
+    afterwards, the biobambam2 version is recorded in a JSON file and all
+    ``<base>.duplicates_metrics.*`` files are packed into a tarball.
+
+    Exits via ``sys.exit`` when the temp directory does not exist, and
+    through ``run_cmd`` when any shell command fails.
+    """
+    parser = argparse.ArgumentParser(description='Merge and markdup')
+    parser.add_argument('-i','--input-bams', dest='input_bams',
+                        type=str, help='Input bam file', nargs='+', required=True)
+    parser.add_argument('-b','--output-base', dest='output_base',
+                        type=str, help='Output merged file basename', required=True)
+    parser.add_argument('-r', '--reference', dest='reference',
+                        type=str, help='reference fasta', required=True)
+    parser.add_argument('-t', '--tempdir', dest='tempdir', type=str, default=".",
+                        help='Specify directory for temporary files')
+    parser.add_argument("-n", "--cpus", dest='cpus', type=int, default=cpu_count())
+    parser.add_argument("-d", "--mdup", dest='mdup', action='store_true')
+    parser.add_argument("-l", "--lossy", dest='lossy', action='store_true')
+    parser.add_argument("-o", "--output-format", dest='output_format', default='cram', choices=['bam', 'cram'])
+
+    args = parser.parse_args()
+
+    cmd = []
+
+    if not os.path.isdir(args.tempdir):
+        sys.exit('Error: specified tempdir %s does not exist!' % args.tempdir)
+
+    # First pipeline stage: merge the inputs, with or without duplicate marking.
+    if args.mdup:
+        merge = 'bammarkduplicates2 markthreads=%s tmpfile=%s/tmp level=0 O=/dev/stdout M=%s I=%s ' % \
+                (str(args.cpus), args.tempdir, args.output_base + ".duplicates_metrics.txt", ' I='.join(args.input_bams))
+    else:
+        merge = 'samtools merge --no-PG -uf -@ %s /dev/stdout %s ' % (str(args.cpus), ' '.join(args.input_bams))
+
+    # Second stage for CRAM output. The backslash is doubled so that Python
+    # emits a literal '\*8' for the shell without an invalid-escape warning.
+    if args.lossy:
+        cram = 'java -jar /tools/cramtools.jar cram -R %s --capture-all-tags --lossy-quality-score-spec \\*8 --preserve-read-names -O %s' % (args.reference, args.output_base + ".cram")
+    else:
+        cram = 'samtools view -C -T %s -@ %s --write-index /dev/stdin -o %s ' % (args.reference, args.cpus, args.output_base + ".cram")
+
+    bam = 'samtools view -b -h -@ %s --write-index /dev/stdin -o %s##idx##%s ' % (args.cpus, args.output_base + ".bam", args.output_base + ".bam.bai")
+    crai1 = 'samtools index -@ %s %s %s ' % (args.cpus, args.output_base + ".cram", args.output_base + ".cram.crai")
+
+    # build command
+    if args.output_format == 'bam':
+        cmd.append('|'.join([merge, bam]))
+
+    elif args.output_format == 'cram':
+        cmd.append('|'.join([merge, cram]))
+        # The cramtools command carries no index option, so index separately.
+        if args.lossy: cmd.append(crai1)
+    else:
+        sys.exit("Unsupported sequence format!")
+
+    for c in cmd:
+        run_cmd(c)
+
+    # The metrics file is only requested from bammarkduplicates2 (the -d path).
+    if os.path.isfile(os.path.join(os.getcwd(), args.output_base + ".duplicates_metrics.txt")):
+        stdout, _ = run_cmd('bammarkduplicates2  -v 2>&1  | grep "biobambam2 version"')
+        # Take the last space-separated token of the grep output, minus any trailing '.'.
+        version = stdout.decode("utf-8").split(' ')[-1].strip().rstrip('.')
+        with open("%s.duplicates_metrics.extra_info.json" % args.output_base, "w") as j:
+          j.write(json.dumps({  "tool": "biobambam2:bammarkduplicates2@%s" % version }, indent=2))
+
+        tgz = 'tar czf %s.duplicates_metrics.tgz %s.duplicates_metrics.*' % (args.output_base, args.output_base)
+        run_cmd(tgz)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/...dules/github.com/icgc-argo/demo-wfpkgs/[email protected]/nextflow.config b/...dules/github.com/icgc-argo/demo-wfpkgs/[email protected]/nextflow.config
@@ -0,0 +1,4 @@
+docker {
+    enabled = true
+    runOptions = '-u \$(id -u):\$(id -g)'
+}
diff --git a/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.1/pkg.json b/wfpr_modules/github.com/icgc-argo/demo-wfpkgs/demo-bam-merge-sort-markdup@1.12.1/pkg.json
@@ -0,0 +1,52 @@
+{
+    "name": "demo-bam-merge-sort-markdup",
+    "version": "1.12.1",
+    "description": "Merge multiple aligned BAMs, sort by coordinate and perform markduplicate",
+    "main": "bam-merge-sort-markdup.nf",
+    "scripts": {
+        "test": "pushd tests && ./run_tests.sh; popd"
+    },
+    "deprecated": false,
+    "keywords": [
+        "bioinformatics",
+        "alignment",
+        "markduplicate"
+    ],
+    "repository": {
+        "type": "git",
+        "url": "https://github.com/icgc-argo/demo-wfpkgs.git"
+    },
+    "container": {
+        "registries": [
+            {
+                "registry": "quay.io",
+                "type": "docker",
+                "org": "icgc-argo",
+                "default": true
+            },
+            {
+                "registry": "ghcr.io",
+                "type": "docker",
+                "org": "icgc-argo",
+                "default": false
+            }
+        ]
+    },
+    "dependencies": [],
+    "devDependencies": [
+        "github.com/icgc-argo/demo-wfpkgs/[email protected]"
+    ],
+    "contributors": [
+        {
+            "name": "Junjun Zhang",
+            "email": "[email protected]"
+        },
+        {
+            "name": "Linda Xiang",
+            "email": "[email protected]"
+        }
+    ],
+    "license": "AGPL-3.0",
+    "bugReport": "https://github.com/icgc-argo/demo-wfpkgs/issues",
+    "homepage": "https://github.com/icgc-argo/demo-wfpkgs#readme"
+}
diff --git a/...ules/github.com/icgc-argo/demo-wfpkgs/[email protected]/tests/checker.nf b/...ules/github.com/icgc-argo/demo-wfpkgs/[email protected]/tests/checker.nf
@@ -0,0 +1,56 @@
+#!/usr/bin/env nextflow
+
+/*
+ * Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
+ *                                                                                                               
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Contributors:
+ *   Junjun Zhang <[email protected]>
+ */
+
+
+nextflow.enable.dsl=2
+
+params.aligned_lane_bams = ""
+params.ref_genome_gz = ""
+params.tempdir = "NO_DIR"
+params.container_registry = "ghcr.io"
+
+
+include { bamMergeSortMarkdup } from '../bam-merge-sort-markdup.nf' params(params)
+include { getSecondaryFiles } from './wfpr_modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/main.nf'
+
+
+Channel  // lane BAMs selected by params.aligned_lane_bams
+  .fromPath(params.aligned_lane_bams, checkIfExists: true)
+  .set { aligned_lane_bams_ch }
+
+Channel  // index files accompanying the reference genome
+  .fromPath(getSecondaryFiles(params.ref_genome_gz, ['fai', 'gzi']), checkIfExists: true)  // presumably the reference path plus each suffix; confirm in demo-utils
+  .set { ref_genome_gz_idx_ch }
+
+
+// will not run when imported as a module
+workflow {
+  main:
+    bamMergeSortMarkdup(
+      aligned_lane_bams_ch.collect(),  // all lane bams to be merged
+      file(params.ref_genome_gz),
+      ref_genome_gz_idx_ch.collect(),
+      file(params.tempdir)  // defaults to 'NO_DIR', which the process treats as "no temp dir given"
+    )
+}
diff --git a/...emo-wfpkgs/[email protected]/tests/input/grch38-aligned.C0HVY_2.lane.bam b/...emo-wfpkgs/[email protected]/tests/input/grch38-aligned.C0HVY_2.lane.bam
diff --git a/...emo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RE2_1.lane.bam b/...emo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RE2_1.lane.bam
diff --git a/...emo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RH0_2.lane.bam b/...emo-wfpkgs/[email protected]/tests/input/grch38-aligned.D0RH0_2.lane.bam
diff --git a/...ch38-aligned.TEST-PRO.DO250122.SA610149.C0HVY_2.2db5a19d9d5fdb9f534f68046bb2e8fa.lane.bam b/...ch38-aligned.TEST-PRO.DO250122.SA610149.C0HVY_2.2db5a19d9d5fdb9f534f68046bb2e8fa.lane.bam
diff --git a/...ch38-aligned.TEST-PRO.DO250122.SA610149.D0RE2_1.0a95ae6789a0d5b1af3c2a24ce377b0b.lane.bam b/...ch38-aligned.TEST-PRO.DO250122.SA610149.D0RE2_1.0a95ae6789a0d5b1af3c2a24ce377b0b.lane.bam
diff --git a/...ch38-aligned.TEST-PRO.DO250122.SA610149.D0RH0.2.231146e66d802729c719428e33e555a8.lane.bam b/...ch38-aligned.TEST-PRO.DO250122.SA610149.D0RH0.2.231146e66d802729c719428e33e555a8.lane.bam
diff --git a/...github.com/icgc-argo/demo-wfpkgs/[email protected]/tests/nextflow.config b/...github.com/icgc-argo/demo-wfpkgs/[email protected]/tests/nextflow.config
@@ -0,0 +1 @@
+includeConfig '../nextflow.config'