Skip to content

Commit

Permalink
Merge pull request #25 from icgc-argo/demo-dna-seq-processing-wf@1.7.…
Browse files Browse the repository at this point in the history
…1-1.3.1

[release]
  • Loading branch information
junjun-zhang authored Feb 15, 2021
2 parents 30f0d3f + 9343071 commit 80029df
Show file tree
Hide file tree
Showing 67 changed files with 22,611 additions and 3 deletions.
2 changes: 1 addition & 1 deletion demo-dna-seq-processing-wf/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

nextflow.enable.dsl = 2
name = 'dna-seq-processing'
version = '1.7.0-1.3.0'
version = '1.7.1-1.3.1'


params.ref_genome_fa = ""
Expand Down
4 changes: 2 additions & 2 deletions demo-dna-seq-processing-wf/pkg.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "demo-dna-seq-alignment-wf",
"version": "1.7.0-1.3.0",
"version": "1.7.1-1.3.1",
"description": "DNA Seq Alignment Workflow with QC Metrics Collection",
"main": "main.nf",
"scripts": {
Expand All @@ -21,7 +21,7 @@
},
"dependencies": [
"github.com/icgc-argo/demo-wfpkgs/[email protected]",
"github.com/icgc-argo/demo-wfpkgs/[email protected].0",
"github.com/icgc-argo/demo-wfpkgs/demo-dna-seq-alignment-wf@1.7.1",
"github.com/icgc-argo/demo-wfpkgs/[email protected]"
],
"devDependencies": [],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.nextflow*
.gitignore
work
outdir
tests
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Tool image: starts from the shared dna-seq-processing-tools base image and
# adds the Python scripts found in the build context.
FROM quay.io/icgc-argo/dna-seq-processing-tools:base-docker.0.1.1

# Links the image to its source repository. Written in key="value" form; the
# whitespace-separated "LABEL key value" form is a legacy syntax.
LABEL org.opencontainers.image.source="https://github.com/icgc-argo/demo-wfpkgs"

# Prepend /tools so files copied there are found first on PATH.
ENV PATH="/tools:${PATH}"

WORKDIR /tools

# Copy every Python script from the build context into /tools.
COPY *.py /tools/

CMD ["/bin/bash"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env nextflow

/*
* Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

/*
* Contributors
* Junjun Zhang <[email protected]>
* Linda Xiang <[email protected]>
*/

/********************************************************************/
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
name = 'demo-aligned-seq-qc'
version = '1.1.0'
container = [
'ghcr.io': 'ghcr.io/icgc-argo/demo-wfpkgs.demo-aligned-seq-qc',
'quay.io': 'quay.io/icgc-argo/demo-wfpkgs.demo-aligned-seq-qc'
]
default_container_registry = 'ghcr.io'
/********************************************************************/
// NOTE(review): pkg.json for this package marks quay.io as "default": true,
// while default_container_registry above is 'ghcr.io' -- confirm which one is
// intended and regenerate the block rather than editing it by hand.

// Default parameter values; each can be overridden by the caller.
// Aligned sequence file passed to the QC tool.
params.seq = ""
// Container image tag; when empty the process uses `version` above.
params.container_version = ""
// Reference genome file passed to the QC tool with -r.
params.ref_genome_gz = ""

// Key into the `container` map above ('ghcr.io' or 'quay.io').
params.container_registry = default_container_registry
// Output directory; publishing is only enabled when this is non-empty.
params.publish_dir = ""
params.cpus = 1
params.mem = 2 // in GB


/*
 * Runs aligned-seq-qc.py on one aligned sequence file inside the package
 * container and emits the resulting "*.qc_metrics.tgz" tarball.
 *
 * Inputs:
 *   seq                          - aligned sequence file (passed with -s)
 *   ref_genome_gz                - reference genome file (passed with -r)
 *   ref_genome_gz_secondary_file - staged next to the reference; not
 *                                  referenced by the script itself
 *   dependencies                 - value not used by the script; presumably
 *                                  only there to order this step after
 *                                  upstream work (confirm with callers)
 */
process alignedSeqQC {
  // Fall back to the default registry when params.container_registry is empty
  // (a caller can pass params with container_registry = ""); without the
  // fallback the map lookup returns null and the image becomes "null:<tag>".
  container "${container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"

  // Publish only when a publish_dir was supplied; ':' in the process name is
  // replaced so the name can be used as a directory name.
  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}",
    mode: "copy",
    enabled: params.publish_dir ? true : false

  cpus params.cpus
  memory "${params.mem} GB"

  input:
    path seq
    path ref_genome_gz
    path ref_genome_gz_secondary_file
    val dependencies

  output:
    path "*.qc_metrics.tgz", emit: metrics

  script:
    """
    aligned-seq-qc.py -s ${seq} \
                      -r ${ref_genome_gz} \
                      -n ${params.cpus}
    """
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python3

"""
Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Contributors:
Junjun Zhang <[email protected]>
Linda Xiang <[email protected]>
"""

import os
import sys
from argparse import ArgumentParser
import subprocess
from multiprocessing import cpu_count
import tarfile
import glob
import json


def collect_metrics(args):
    """Run ``samtools stats`` on ``args.seq`` and package the results.

    Files written to the current working directory (``<seq>`` is the base
    name of ``args.seq``):

    * ``<seq>.bamstat`` - the captured output of ``samtools stats``
      (stderr is merged into stdout, so any samtools messages end up here too).
    * ``<seq>.extra_info.json`` - selected "SN" summary numbers under
      friendlier keys, plus a ``tool`` entry with the samtools version.
    * ``<seq>.qc_metrics.tgz`` - gzip tarball of every ``*.bamstat`` and
      ``*.extra_info.json`` file found in the working directory.

    Args:
        args: object with ``seq`` (path to the aligned sequence file),
            ``reference`` (path to the reference file) and ``cpus`` (int).

    Raises:
        SystemExit: if samtools cannot be run, exits non-zero, or its
            version cannot be determined.
    """
    cwd = os.getcwd()
    seq_name = os.path.basename(args.seq)
    stats_path = os.path.join(cwd, seq_name + ".bamstat")

    # --split RG additionally produces per-read-group stats files; -P sets the
    # path prefix of those files so they land in the working directory.
    stats_args = [
        '--reference', args.reference,
        '-@', str(args.cpus),
        '-r', args.reference,
        '--split', 'RG',
        '-P', os.path.join(cwd, seq_name)
    ]

    cmd = ['samtools', 'stats'] + stats_args + [args.seq]
    try:
        p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
    except Exception as e:
        # CalledProcessError carries what samtools printed; include it so the
        # failure reason is not lost. Other exceptions have no captured output.
        captured = getattr(e, 'output', None)
        detail = captured.decode('utf-8', errors='replace') if captured else ''
        sys.exit("Error: %s. 'samtools stats' failed: %s\n%s" % (e, args.seq, detail))

    stats_text = p.stdout.decode('utf-8')
    with open(stats_path, 'w') as f:
        f.write(stats_text)

    # Maps the samtools "SN" (summary numbers) field names to the keys used in
    # the extra_info JSON.
    collected_sum_fields = {
        'raw total sequences': 'total_reads',
        'reads mapped': 'mapped_reads',
        'reads paired': 'paired_reads',
        'reads properly paired': 'properly_paired_reads',
        'pairs on different chromosomes': 'pairs_on_different_chromosomes',
        'total length': 'total_bases',
        'bases mapped (cigar)': 'mapped_bases_cigar',
        'mismatches': 'mismatch_bases',
        'error rate': 'error_rate',
        'bases duplicated': 'duplicated_bases',
        'insert size average': 'average_insert_size',
        'average length': 'average_length'
    }

    extra_info = {}
    # Parse the text already in memory instead of re-reading the file just written.
    for row in stats_text.split('\n'):
        if not row.startswith('SN\t'):
            continue
        # Field names carry a trailing ':' which is dropped before splitting.
        cols = row.replace(':', '').strip().split('\t')
        if len(cols) < 3 or cols[1] not in collected_sum_fields:
            continue
        raw_value = cols[2]
        # Values with a decimal point or exponent are floats, the rest are counts.
        is_float = '.' in raw_value or 'e' in raw_value
        extra_info[collected_sum_fields[cols[1]]] = float(raw_value) if is_float else int(raw_value)

    # Ask samtools for its version directly (no shell / grep pipeline) and read
    # it from the "samtools <version>" line rather than from the last token of
    # the whole output.
    try:
        p = subprocess.run(['samtools', '--version'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
    except Exception as e:
        sys.exit("Error: %s. 'samtools --version' failed\n" % e)
    version_line = next(
        (ln for ln in p.stdout.decode('utf-8').splitlines() if ln.startswith('samtools ')),
        None
    )
    if version_line is None:
        sys.exit("Error: unable to determine samtools version\n")
    tool_ver = "samtools:stats@%s" % version_line.strip().split(' ')[-1]
    extra_info.update({"tool": tool_ver})

    with open(seq_name + '.extra_info.json', "w") as j:
        j.write(json.dumps(extra_info, indent=2))

    # make tar gzip ball of the *.bamstat files (including the per-read-group
    # ones) together with the extra_info JSON
    tarfile_name = seq_name + '.qc_metrics.tgz'
    with tarfile.open(tarfile_name, "w:gz") as tar:
        members = glob.glob(os.path.join(cwd, "*.bamstat")) + glob.glob(os.path.join(cwd, "*.extra_info.json"))
        for statsfile in members:
            tar.add(statsfile, arcname=os.path.basename(statsfile))


def main():
    """Parse the command-line options and pass them on to collect_metrics()."""
    cli = ArgumentParser()
    cli.add_argument(
        "-s", "--seq",
        dest="seq",
        help="Aligned sequence file",
        type=str,
        required=True,
    )
    cli.add_argument(
        '-r', '--reference',
        dest='reference',
        type=str,
        help='reference fasta',
        required=True,
    )
    # Defaults to the number of CPUs reported by multiprocessing.cpu_count().
    cli.add_argument(
        '-n', '--cpus',
        dest='cpus',
        type=int,
        help='number of cpu cores',
        default=cpu_count(),
    )

    options = cli.parse_args()
    collect_metrics(options)


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Run processes in Docker, passing the invoking user's uid:gid to `docker run -u`.
docker.enabled = true
docker.runOptions = '-u \$(id -u):\$(id -g)'
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"name": "demo-aligned-seq-qc",
"version": "1.1.0",
"description": "Collect QC metrics from aligned sequencing reads",
"main": "aligned-seq-qc.nf",
"scripts": {
"test": "pushd tests && ./run_tests.sh; popd"
},
"deprecated": false,
"keywords": [
"bioinformatics",
"alignment",
"qc",
"metrics"
],
"repository": {
"type": "git",
"url": "https://github.com/icgc-argo/demo-wfpkgs.git"
},
"container": {
"registries": [
{
"registry": "quay.io",
"type": "docker",
"org": "icgc-argo",
"default": true
},
{
"registry": "ghcr.io",
"type": "docker",
"org": "icgc-argo",
"default": false
}
]
},
"dependencies": [],
"devDependencies": [
"github.com/icgc-argo/demo-wfpkgs/[email protected]"
],
"contributors": [
{
"name": "Junjun Zhang",
"email": "[email protected]"
},
{
"name": "Linda Xiang",
"email": "[email protected]"
}
],
"license": "AGPL-3.0",
"bugReport": "https://github.com/icgc-argo/demo-wfpkgs/issues",
"homepage": "https://github.com/icgc-argo/demo-wfpkgs#readme"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env nextflow

/*
* Copyright (c) 2019-2020, Ontario Institute for Cancer Research (OICR).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

/*
* Contributors:
* Junjun Zhang <[email protected]>
* Linda Xiang <[email protected]>
*/

// Test driver for the alignedSeqQC process.
// Uses the same DSL2 switch as the module under test (aligned-seq-qc.nf sets
// nextflow.enable.dsl = 2) instead of the older nextflow.preview.dsl flag.
nextflow.enable.dsl = 2

params.seq = ""
params.container_version = ""
params.ref_genome_gz = ""

params.container_registry = ""
params.cpus = 1
params.mem = 2  // in GB


// The module is included with this script's params, so the values above
// (including the empty strings) are what the process sees.
include { alignedSeqQC } from '../aligned-seq-qc.nf' params(params)
// NOTE(review): the package@version segment of the path below looks mangled
// ("[email protected]") in this copy -- restore it from the devDependencies
// entry in pkg.json before running.
include { getSecondaryFiles } from './wfpr_modules/github.com/icgc-argo/demo-wfpkgs/[email protected]/main.nf'

// Aligned sequence file(s) matched by params.seq; fails if nothing exists.
Channel
  .fromPath(params.seq, checkIfExists: true)
  .set { aligned_seq }

// Secondary files of the reference, requested with the 'fai' and 'gzi' suffixes.
Channel
  .fromPath(getSecondaryFiles(params.ref_genome_gz, ['fai', 'gzi']), checkIfExists: true)
  .set { ref_genome_gz_idx }

workflow {
  alignedSeqQC(
    aligned_seq.flatten(),
    file(params.ref_genome_gz),
    ref_genome_gz_idx.collect(),
    true  // value for the process's `dependencies` input
  )
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Pull in the nextflow.config from the parent directory.
includeConfig '../nextflow.config'
Loading

0 comments on commit 80029df

Please sign in to comment.