From 2844e5a6c56e572879a4a975efbb747238f176e3 Mon Sep 17 00:00:00 2001 From: Matthias De Smet Date: Fri, 6 Mar 2020 12:31:49 +0100 Subject: [PATCH] extra tools (#21) * gatk4 versions, add latest version * new gatk4 tool - mergebamalignment * syntax fixes * naming error * new tool: GATK4 FastqToSam * new tool: GATK4 FastqToSam * fix merge error * new tool: bamsormadup * new tool: io_lib Scramble * import and formatting fixes * fix version imports * fix bamsormadup tool args * bamsormadup: fix docker url * fix scramble tool args * scramble: fix output type * remove bool type from ToolArgument * bump gatk4 version * update fastaFai type, usually fai and dict are required together in workflows * Revert "update fastaFai type, usually fai and dict are required together in workflows" This reverts commit be8d8807d92ade6351c316f7b16bffb036af9914. --- janis_bioinformatics/tools/__init__.py | 2 + .../tools/biobambam/__init__.py | 1 + .../tools/biobambam/bamsormadup/__init__.py | 0 .../tools/biobambam/bamsormadup/base.py | 175 ++++++++ .../tools/biobambam/bamsormadup/versions.py | 13 + .../tools/biobambam/versions.py | 12 + janis_bioinformatics/tools/gatk4/__init__.py | 32 +- .../tools/gatk4/fastqtosam/__init__.py | 0 .../tools/gatk4/fastqtosam/base.py | 341 +++++++++++++++ .../tools/gatk4/fastqtosam/versions.py | 24 ++ .../tools/gatk4/mergebamalignment/__init__.py | 0 .../tools/gatk4/mergebamalignment/base.py | 390 ++++++++++++++++++ .../tools/gatk4/mergebamalignment/versions.py | 24 ++ janis_bioinformatics/tools/gatk4/versions.py | 16 +- janis_bioinformatics/tools/io_lib/__init__.py | 1 + .../tools/io_lib/scramble/__init__.py | 0 .../tools/io_lib/scramble/base.py | 227 ++++++++++ .../tools/io_lib/scramble/versions.py | 13 + janis_bioinformatics/tools/io_lib/versions.py | 12 + 19 files changed, 1266 insertions(+), 17 deletions(-) create mode 100644 janis_bioinformatics/tools/biobambam/__init__.py create mode 100644 janis_bioinformatics/tools/biobambam/bamsormadup/__init__.py 
create mode 100644 janis_bioinformatics/tools/biobambam/bamsormadup/base.py create mode 100644 janis_bioinformatics/tools/biobambam/bamsormadup/versions.py create mode 100644 janis_bioinformatics/tools/biobambam/versions.py create mode 100644 janis_bioinformatics/tools/gatk4/fastqtosam/__init__.py create mode 100644 janis_bioinformatics/tools/gatk4/fastqtosam/base.py create mode 100644 janis_bioinformatics/tools/gatk4/fastqtosam/versions.py create mode 100644 janis_bioinformatics/tools/gatk4/mergebamalignment/__init__.py create mode 100644 janis_bioinformatics/tools/gatk4/mergebamalignment/base.py create mode 100644 janis_bioinformatics/tools/gatk4/mergebamalignment/versions.py create mode 100644 janis_bioinformatics/tools/io_lib/__init__.py create mode 100644 janis_bioinformatics/tools/io_lib/scramble/__init__.py create mode 100644 janis_bioinformatics/tools/io_lib/scramble/base.py create mode 100644 janis_bioinformatics/tools/io_lib/scramble/versions.py create mode 100644 janis_bioinformatics/tools/io_lib/versions.py diff --git a/janis_bioinformatics/tools/__init__.py b/janis_bioinformatics/tools/__init__.py index 59f2a75f6..91b96f6a7 100644 --- a/janis_bioinformatics/tools/__init__.py +++ b/janis_bioinformatics/tools/__init__.py @@ -4,6 +4,7 @@ ) from janis_bioinformatics.tools import ( babrahambioinformatics, + biobambam, bcftools, bwa, common, @@ -14,6 +15,7 @@ gatk4, htslib, illumina, + io_lib, multiqc, papenfuss, pmac, diff --git a/janis_bioinformatics/tools/biobambam/__init__.py b/janis_bioinformatics/tools/biobambam/__init__.py new file mode 100644 index 000000000..900f763df --- /dev/null +++ b/janis_bioinformatics/tools/biobambam/__init__.py @@ -0,0 +1 @@ +from .bamsormadup.versions import * diff --git a/janis_bioinformatics/tools/biobambam/bamsormadup/__init__.py b/janis_bioinformatics/tools/biobambam/bamsormadup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/janis_bioinformatics/tools/biobambam/bamsormadup/base.py 
b/janis_bioinformatics/tools/biobambam/bamsormadup/base.py new file mode 100644 index 000000000..bc53cf33c --- /dev/null +++ b/janis_bioinformatics/tools/biobambam/bamsormadup/base.py @@ -0,0 +1,175 @@ +from abc import ABC +from typing import Any, Dict + +from janis_core import ( + ToolInput, + ToolArgument, + WildcardSelector, + Int, + Float, + Boolean, + String, + ToolOutput, + Filename, + InputSelector, + CaptureType, + CpuSelector, + Stdout, + get_value_for_hints_and_ordered_resource_tuple, + ToolMetadata, +) + +from janis_bioinformatics.data_types import Sam, FastaWithDict, FastqGzPair, Bam, File +from janis_bioinformatics.tools.bioinformaticstoolbase import BioinformaticsTool + +BAMSORMADUP_MEM_TUPLE = [ + ( + CaptureType.key(), + { + CaptureType.TARGETED: 8, + CaptureType.EXOME: 12, + CaptureType.CHROMOSOME: 12, + CaptureType.THIRTYX: 16, + CaptureType.NINETYX: 20, + CaptureType.THREEHUNDREDX: 24, + }, + ) +] + +BAMSORMADUP_CORES_TUPLE = [ + ( + CaptureType.key(), + { + CaptureType.TARGETED: 16, + CaptureType.EXOME: 20, + CaptureType.CHROMOSOME: 24, + CaptureType.THIRTYX: 30, + CaptureType.NINETYX: 32, + CaptureType.THREEHUNDREDX: 32, + }, + ) +] + + +class BamSorMaDupBase(BioinformaticsTool, ABC): + def tool(self): + return "bamsormadup" + + def friendly_name(self): + return "BamSorMaDup" + + def tool_provider(self): + return "BioBamBam" + + def base_command(self): + return ["bamsormadup"] + + def inputs(self): + return [ + ToolInput("alignedReads", Bam(), position=200), + ToolInput("outputFilename", Filename(extension=".bam")), + *BamSorMaDupBase.additional_inputs, + ] + + def arguments(self): + return [ + ToolArgument( + "metrics.txt", + prefix="M=", + separate_value_from_prefix=False, + doc="file containing metrics from duplicate removal", + ), + ToolArgument( + "bam", + prefix="inputformat=", + separate_value_from_prefix=False, + doc="input data format", + ), + ToolArgument( + "bam", + prefix="outputFormat=", + separate_value_from_prefix=False, + 
doc="output data format", + ), + ] + + def outputs(self): + return [ + ToolOutput( + "out", Stdout(Bam(), stdoutname=InputSelector("outputFilename")) + ), + ToolOutput("metrics", File(), glob=WildcardSelector("metrics.txt")), + ] + + def memory(self, hints: Dict[str, Any]): + val = get_value_for_hints_and_ordered_resource_tuple( + hints, BAMSORMADUP_MEM_TUPLE + ) + if val: + return val + return 16 + + def cpus(self, hints: Dict[str, Any]): + val = get_value_for_hints_and_ordered_resource_tuple( + hints, BAMSORMADUP_CORES_TUPLE + ) + if val: + return val + return 4 + + def bind_metadata(self): + from datetime import date + + return ToolMetadata( + contributors=["Matthias De Smet (@mattdsm)"], + dateCreated=date(2020, 2, 26), + dateUpdated=date(2020, 2, 26), + institution="None", + doi=None, + keywords=["duplicates", "sort"], + documentationUrl="https://gitlab.com/german.tischler/biobambam2", + documentation="bamsormadup: parallel sorting and duplicate marking", + ) + + additional_inputs = [ + ToolInput( + "level", + Int(optional=True), + prefix="level=", + separate_value_from_prefix=False, + default=0, + doc="compression settings for output bam file (-1=zlib default,0=uncompressed,1=fast,9=best)", + ), + ToolInput( + "tempLevel", + Int(optional=True), + prefix="templevel=", + separate_value_from_prefix=False, + default=0, + doc="compression settings for temporary bam files (-1=zlib default,0=uncompressed,1=fast,9=best)", + ), + ToolInput( + "threads", + Int(optional=True), + default=CpuSelector(), + prefix="threads=", + separate_value_from_prefix=False, + doc="Number of threads. 
(default = 1)", + ), + ToolInput( + "sortOrder", + String(optional=True), + prefix="SO=", + separate_value_from_prefix=False, + default="coordinate", + doc="output sort order(coordinate by default)", + ), + ToolInput( + "optMinPixelDif", + Int(optional=True), + prefix="optminpixeldif=", + separate_value_from_prefix=False, + default=2500, + doc="pixel difference threshold for optical duplicates (patterned flowcell: 12000, unpatterned flowcell: 2500)", + ), + ] diff --git a/janis_bioinformatics/tools/biobambam/bamsormadup/versions.py b/janis_bioinformatics/tools/biobambam/bamsormadup/versions.py new file mode 100644 index 000000000..b455fc5a7 --- /dev/null +++ b/janis_bioinformatics/tools/biobambam/bamsormadup/versions.py @@ -0,0 +1,13 @@ +from ..versions import BioBamBam_2_0_87 +from .base import BamSorMaDupBase + + +class BamSorMaDup_2_0_87(BioBamBam_2_0_87, BamSorMaDupBase): + pass + + +BamSorMaDupLatest = BamSorMaDup_2_0_87 + + +if __name__ == "__main__": + print(BamSorMaDupLatest().help()) diff --git a/janis_bioinformatics/tools/biobambam/versions.py b/janis_bioinformatics/tools/biobambam/versions.py new file mode 100644 index 000000000..4e13b97f1 --- /dev/null +++ b/janis_bioinformatics/tools/biobambam/versions.py @@ -0,0 +1,12 @@ +from abc import ABC + + +class BioBamBam_2_0_87(ABC): + def container(self): + return "quay.io/biocontainers/biobambam:2.0.87--1" + + def version(self): + return "2.0.87" + + +BioBamBamLatest = BioBamBam_2_0_87 diff --git a/janis_bioinformatics/tools/gatk4/__init__.py b/janis_bioinformatics/tools/gatk4/__init__.py index cd66d8fd4..291517dab 100644 --- a/janis_bioinformatics/tools/gatk4/__init__.py +++ b/janis_bioinformatics/tools/gatk4/__init__.py @@ -18,12 +18,26 @@ Gatk4CalculateContamination_4_1_4, Gatk4CalculateContaminationLatest, ) +from .createsequencedictionary.versions import ( + Gatk4CreateSequenceDictionary_4_1_2, + Gatk4CreateSequenceDictionary_4_1_3, + Gatk4CreateSequenceDictionary_4_1_4, + 
Gatk4CreateSequenceDictionaryLatest, +) +from .fastqtosam.versions import * from .filtermutectcalls.versions import ( Gatk4FilterMutectCalls_4_1_2, Gatk4FilterMutectCalls_4_1_3, Gatk4FilterMutectCalls_4_1_4, Gatk4FilterMutectCallsLatest, ) +from .gathervcfs.versions import ( + Gatk4GatherVcfs_4_0, + Gatk4GatherVcfs_4_1_2, + Gatk4GatherVcfs_4_1_3, + Gatk4GatherVcfs_4_1_4, + Gatk4GatherVcfsLatest, +) from .genotypeconcordance.versions import ( Gatk4GenotypeConcordance_4_0, Gatk4GenotypeConcordance_4_1_2, @@ -57,6 +71,7 @@ Gatk4MarkDuplicates_4_1_4, Gatk4MarkDuplicatesLatest, ) +from .mergebamalignment.versions import * from .mergemutectstats.versions import ( Gatk4MergeMutectStats_4_1_2, Gatk4MergeMutectStats_4_1_3, @@ -91,24 +106,9 @@ Gatk4SortSam_4_1_4, Gatk4SortSamLatest, ) -from .gathervcfs.versions import ( - Gatk4GatherVcfs_4_0, - Gatk4GatherVcfs_4_1_2, - Gatk4GatherVcfs_4_1_3, - Gatk4GatherVcfs_4_1_4, - Gatk4GatherVcfsLatest, -) - from .splitreads.versions import ( + Gatk4SortSamLatest, Gatk4SplitReads_4_1_2, Gatk4SplitReads_4_1_3, Gatk4SplitReads_4_1_4, - Gatk4SortSamLatest, -) - -from .createsequencedictionary.versions import ( - Gatk4CreateSequenceDictionary_4_1_2, - Gatk4CreateSequenceDictionary_4_1_3, - Gatk4CreateSequenceDictionary_4_1_4, - Gatk4CreateSequenceDictionaryLatest, ) diff --git a/janis_bioinformatics/tools/gatk4/fastqtosam/__init__.py b/janis_bioinformatics/tools/gatk4/fastqtosam/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/janis_bioinformatics/tools/gatk4/fastqtosam/base.py b/janis_bioinformatics/tools/gatk4/fastqtosam/base.py new file mode 100644 index 000000000..92da8ab4f --- /dev/null +++ b/janis_bioinformatics/tools/gatk4/fastqtosam/base.py @@ -0,0 +1,341 @@ +from abc import ABC +from typing import Dict, Any + +from janis_core import ( + ToolInput, + Filename, + String, + ToolOutput, + Array, + File, + Int, + Boolean, + InputSelector, + CaptureType, +) +from janis_core import 
get_value_for_hints_and_ordered_resource_tuple +from janis_core import ToolMetadata + +from janis_bioinformatics.data_types import FastaWithDict, BamBai, FastqGz +from ..gatk4toolbase import Gatk4ToolBase + +CORES_TUPLE = [ + ( + CaptureType.key(), + { + CaptureType.CHROMOSOME: 1, + CaptureType.EXOME: 1, + CaptureType.THIRTYX: 1, + CaptureType.NINETYX: 1, + CaptureType.THREEHUNDREDX: 1, + }, + ) +] + +MEM_TUPLE = [ + ( + CaptureType.key(), + { + CaptureType.CHROMOSOME: 16, + CaptureType.EXOME: 16, + CaptureType.THIRTYX: 16, + CaptureType.NINETYX: 32, + CaptureType.THREEHUNDREDX: 32, + }, + ) +] + + +class Gatk4FastqToSamBase(Gatk4ToolBase, ABC): + @classmethod + def gatk_command(cls): + return "FastqToSam" + + def tool(self): + return "Gatk4FastqToSam" + + def friendly_name(self): + return "GATK4: Convert a FASTQ file to an unaligned BAM or SAM file." + + def cpus(self, hints: Dict[str, Any]): + val = get_value_for_hints_and_ordered_resource_tuple( + hints, CORES_TUPLE) + if val: + return val + return 1 + + def memory(self, hints: Dict[str, Any]): + val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE) + if val: + return val + return 4 + + def inputs(self): + return [ + ToolInput( + "fastqR1", + FastqGz(), + prefix="--FASTQ", + prefix_applies_to_all_elements=True, + doc="Input fastq file (optionally gzipped) for single end data, or first read in paired end data.", + position=10, + ), + ToolInput( + "fastqR2", + FastqGz(optional=True), + prefix="--FASTQ2", + prefix_applies_to_all_elements=True, + doc="Input fastq file (optionally gzipped) for the second read of paired end data.", + position=10, + ), + ToolInput( + "sampleName", + String(optional=True), + prefix="--SAMPLE_NAME", + prefix_applies_to_all_elements=True, + doc="Sample name to insert into the read group header", + position=10, + ), + ToolInput( + "reference", + FastaWithDict(optional=True), + prefix="--REFERENCE_SEQUENCE", +
position=10, + doc="Reference sequence file.", + ), + ToolInput( + "outputFilename", + Filename(extension=".bam"), + position=10, + prefix="--OUTPUT", + doc="Output SAM or BAM file to write to.", + ), + *self.additional_args, + ] + + def outputs(self): + return [ + ToolOutput( + "out", + BamBai(), + glob=InputSelector("outputFilename"), + secondaries_present_as={".bai": "^.bai"}, + ) + ] + + def bind_metadata(self): + from datetime import date + + return ToolMetadata( + contributors=[ + "Michael Franklin (@illisional)", "Matthias De Smet(@matthdsm)"], + dateCreated=date(2020, 2, 26), + dateUpdated=date(2020, 2, 26), + institution="Broad Institute", + doi=None, + citation="See https://software.broadinstitute.org/gatk/documentation/article?id=11027 for more information", + keywords=["gatk", "gatk4", "broad", "merge", "sam"], + documentationUrl="https://gatk.broadinstitute.org/hc/en-us/articles/360037226792-FastqToSam-Picard-", + documentation="Converts a FASTQ file to an unaligned BAM or SAM file.", + ) + + additional_args = [ + ToolInput( + "allowAndIgnoreEmptyLines", + Boolean(optional=True), + prefix="--ALLOW_AND_IGNORE_EMPTY_LINES", + position=11, + doc="Allow (and ignore) empty lines" + ), + ToolInput( + "argumentsFile", + Array(File(), optional=True), + prefix="--arguments_file", + position=11, + doc="read one or more arguments files and add them to the command line", + ), + ToolInput( + "comment", + Array(String(), optional=True), + prefix="--COMMENT", + position=11, + doc="Comment(s) to include in the merged output file's header.", + ), + ToolInput( + "description", + Array(String(), optional=True), + prefix="--DESCRIPTION", + position=11, + doc="Inserted into the read group header", + ), + ToolInput( + "libraryName", + Array(String(), optional=True), + prefix="--LIBRARY_NAME", + position=11, + doc="The library name to place into the LB attribute in the read group header", + ), + ToolInput( + "maxQ", + Int(optional=True), + prefix="--MAX_Q", + position=11, +
doc="Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.", + ), + ToolInput( + "minQ", + Int(optional=True), + prefix="--MIN_Q", + position=11, + doc="Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.", + ), + ToolInput( + "platform", + String(optional=True), + prefix="--PLATFORM", + position=11, + doc="The platform type (e.g. ILLUMINA, SOLID) to insert into the read group header." + ), + ToolInput( + "platformModel", + String(optional=True), + prefix="--PLATFORM_MODEL", + position=11, + doc="Platform model to insert into the group header (free-form text providing further details of the platform/technology used)." + ), + ToolInput( + "platformUnit", + String(optional=True), + prefix="--PLATFORM_UNIT", + position=11, + doc="The platform unit (often run_barcode.lane) to insert into the read group header", + ), + ToolInput( + "predictedInsertSize", + Int(optional=True), + prefix="--PREDICTED_INSERT_SIZE", + position=11, + doc="Predicted median insert size, to insert into the read group header." + ), + ToolInput( + "programGroup", + String(optional=True), + prefix="--PROGRAM_GROUP", + position=11, + doc="Program group to insert into the read group header." + ), + ToolInput( + "readGroupName", + String(optional=True), + prefix="--READ_GROUP_NAME", + position=11, + doc="Read group name." + ), + ToolInput( + "runDate", + String(optional=True), + prefix="--RUN_DATE", + position=11, + doc="Date the run was produced, to insert into the read group header" + ), + ToolInput( + "sequencingCenter", + String(optional=True), + prefix="--SEQUENCING_CENTER", + position=11, + doc="The sequencing center from which the data originated."
+ ), + ToolInput( + "sortOrder", + String(optional=True), + prefix="-SO", + position=10, + doc="The --SORT_ORDER argument is an enumerated type (SortOrder), which can have one of " + "the following values: [unsorted, queryname, coordinate, duplicate, unknown]", + ), + ToolInput( + "useSequenctialFastqs", + Boolean(optional=True), + prefix="--USE_SEQUENTIAL_FASTQS", + position=11, + doc="Use sequential fastq files with the suffix _###.fastq or _###.fastq.gz." + ), + ToolInput( + "compressionLevel", + Int(optional=True), + prefix="--COMPRESSION_LEVEL", + position=11, + doc="Compression level for all compressed files created (e.g. BAM and GELI).", + ), + ToolInput( + "createIndex", + Boolean(optional=True), + prefix="--CREATE_INDEX", + position=11, + doc="Whether to create a BAM index when writing a coordinate-sorted BAM file.", + ), + ToolInput( + "createMd5File", + Boolean(optional=True), + prefix="--CREATE_MD5_FILE", + position=11, + doc="Whether to create an MD5 digest for any BAM or FASTQ files created.", + ), + ToolInput( + "maxRecordsInRam", + Int(optional=True), + prefix="--MAX_RECORDS_IN_RAM", + position=11, + doc="When writing SAM files that need to be sorted, this will specify the number of " + "records stored in RAM before spilling to disk. 
Increasing this number reduces " + "the number of file handles needed to sort a SAM file, and increases the amount of RAM needed.", + ), + ToolInput( + "quiet", + Boolean(optional=True), + prefix="--QUIET", + position=11, + doc="Whether to suppress job-summary info on System.err.", + ), + ToolInput( + "tmpDir", + String(optional=True), + prefix="--TMP_DIR", + position=11, + default="/tmp/", + doc="Undocumented option", + ), + ToolInput( + "useJdkDeflater", + Boolean(optional=True), + prefix="--use_jdk_deflater", + position=11, + doc="Whether to use the JdkDeflater (as opposed to IntelDeflater)", + ), + ToolInput( + "useJdkInflater", + Boolean(optional=True), + prefix="--use_jdk_inflater", + position=11, + doc="Whether to use the JdkInflater (as opposed to IntelInflater)", + ), + ToolInput( + "validationStringency", + String(optional=True), + prefix="--VALIDATION_STRINGENCY", + position=11, + doc="Validation stringency for all SAM files read by this program. Setting stringency to SILENT " + "can improve performance when processing a BAM file in which variable-length data " + "(read, qualities, tags) do not otherwise need to be decoded." 
+ "The --VALIDATION_STRINGENCY argument is an enumerated type (ValidationStringency), " + "which can have one of the following values: [STRICT, LENIENT, SILENT]", + ), + ToolInput( + "verbosity", + String(optional=True), + prefix="--verbosity", + position=11, + doc="The --verbosity argument is an enumerated type (LogLevel), which can have " + "one of the following values: [ERROR, WARNING, INFO, DEBUG]", + ), + ] diff --git a/janis_bioinformatics/tools/gatk4/fastqtosam/versions.py b/janis_bioinformatics/tools/gatk4/fastqtosam/versions.py new file mode 100644 index 000000000..67ee4e423 --- /dev/null +++ b/janis_bioinformatics/tools/gatk4/fastqtosam/versions.py @@ -0,0 +1,24 @@ +from .base import Gatk4FastqToSamBase +from ..versions import Gatk_4_0_12, Gatk_4_1_2_0, Gatk_4_1_3_0, Gatk_4_1_4_0, Gatk_4_1_4_1 + + +class Gatk4FastqToSam_4_0(Gatk_4_0_12, Gatk4FastqToSamBase): + pass + + +class Gatk4FastqToSam_4_1_2(Gatk_4_1_2_0, Gatk4FastqToSamBase): + pass + + +class Gatk4FastqToSam_4_1_3(Gatk_4_1_3_0, Gatk4FastqToSamBase): + pass + + +class Gatk4FastqToSam_4_1_4(Gatk_4_1_4_1, Gatk4FastqToSamBase): + pass + + +Gatk4FastqToSamLatest = Gatk4FastqToSam_4_1_4 + +if __name__ == "__main__": + print(Gatk4FastqToSamLatest().help()) diff --git a/janis_bioinformatics/tools/gatk4/mergebamalignment/__init__.py b/janis_bioinformatics/tools/gatk4/mergebamalignment/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/janis_bioinformatics/tools/gatk4/mergebamalignment/base.py b/janis_bioinformatics/tools/gatk4/mergebamalignment/base.py new file mode 100644 index 000000000..70b15c54f --- /dev/null +++ b/janis_bioinformatics/tools/gatk4/mergebamalignment/base.py @@ -0,0 +1,390 @@ +from abc import ABC +from typing import Dict, Any + +from janis_core import ( + ToolInput, + Filename, + String, + ToolOutput, + Array, + File, + Int, + Boolean, + InputSelector, + CaptureType, +) +from janis_core import get_value_for_hints_and_ordered_resource_tuple +from janis_core import 
ToolMetadata + +from janis_bioinformatics.data_types import FastaWithDict, BamBai +from ..gatk4toolbase import Gatk4ToolBase + +CORES_TUPLE = [ + ( + CaptureType.key(), + { + CaptureType.CHROMOSOME: 1, + CaptureType.EXOME: 1, + CaptureType.THIRTYX: 1, + CaptureType.NINETYX: 1, + CaptureType.THREEHUNDREDX: 1, + }, + ) +] + +MEM_TUPLE = [ + ( + CaptureType.key(), + { + CaptureType.CHROMOSOME: 16, + CaptureType.EXOME: 16, + CaptureType.THIRTYX: 16, + CaptureType.NINETYX: 32, + CaptureType.THREEHUNDREDX: 32, + }, + ) +] + + +class Gatk4MergeBamAlignmentBase(Gatk4ToolBase, ABC): + @classmethod + def gatk_command(cls): + return "MergeBamAlignment" + + def tool(self): + return "Gatk4MergeBamAlignment" + + def friendly_name(self): + return "GATK4: Merge SAM or BAM with unmapped BAM file" + + def cpus(self, hints: Dict[str, Any]): + val = get_value_for_hints_and_ordered_resource_tuple( + hints, CORES_TUPLE) + if val: + return val + return 1 + + def memory(self, hints: Dict[str, Any]): + val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE) + if val: + return val + return 4 + + def inputs(self): + return [ + ToolInput( + "ubam", + BamBai(), + prefix="--UNMAPPED_BAM", + prefix_applies_to_all_elements=True, + doc="Original SAM or BAM file of unmapped reads, which must be in queryname order.", + position=10, + ), + ToolInput( + "bam", + Array(BamBai()), + prefix="--ALIGNED_BAM", + prefix_applies_to_all_elements=True, + doc="SAM or BAM file(s) with alignment data.", + position=10, + ), + ToolInput( + "reference", + FastaWithDict(optional=True), + prefix="--REFERENCE_SEQUENCE", + position=10, + doc="Reference sequence file.", + ), + ToolInput( + "outputFilename", + Filename(extension=".bam"), + position=10, + prefix="--OUTPUT", + doc="Merged SAM or BAM file to write to.", + ), + *self.additional_args, + ] + + def outputs(self): + return [ + ToolOutput( + "out", + BamBai(), + glob=InputSelector("outputFilename"), + secondaries_present_as={".bai": "^.bai"}, + ) + ] 
+ + def bind_metadata(self): + from datetime import date + + return ToolMetadata( + contributors=[ + "Michael Franklin (@illisional)", "Matthias De Smet(@matthdsm)" + ], + dateCreated=date(2018, 12, 24), + dateUpdated=date(2020, 2, 26), + institution="Broad Institute", + doi=None, + citation="See https://software.broadinstitute.org/gatk/documentation/article?id=11027 for more information", + keywords=["gatk", "gatk4", "broad", "merge", "sam"], + documentationUrl="https://gatk.broadinstitute.org/hc/en-us/articles/360037225832-MergeBamAlignment-Picard-", + documentation="Merges SAM/BAM file with an unmapped BAM file", + ) + + additional_args = [ + ToolInput( + "addMateCigar", + Boolean(optional=True), + prefix="--ADD_MATE_CIGAR", + position=11, + doc="Adds the mate CIGAR tag (MC)" + ), + ToolInput( + "alignedReadsOnly", + Boolean(optional=True), + prefix="--ALIGNED_READS_ONLY", + position=11, + doc="Whether to output only aligned reads." + ), + ToolInput( + "alignerProperPairFlags", + Boolean(optional=True), + prefix="--ALIGNER_PROPER_PAIR_FLAGS", + position=11, + doc="Use the aligner's idea of what a proper pair is rather than computing in this program." 
+ ), + ToolInput( + "argumentsFile", + Array(File(), optional=True), + prefix="--arguments_file", + position=11, + doc="read one or more arguments files and add them to the command line", + ), + ToolInput( + "attributesToRemove", + Array(String(), optional=True), + prefix="--ATTRIBUTES_TO_REMOVE", + position=11, + doc="Attributes from the alignment record that should be removed when merging.", + ), + ToolInput( + "attributesToRetain", + Array(String(), optional=True), + prefix="--ATTRIBUTES_TO_RETAIN", + position=11, + doc="Reserved alignment attributes (tags starting with X, Y, or Z) that should be brought over from the alignment data when merging.", + ), + ToolInput( + "attributesToReverse", + Array(String(), optional=True), + prefix="--ATTRIBUTES_TO_REVERSE", + position=11, + doc="Attributes on negative strand reads that need to be reversed.", + ), + ToolInput( + "attributesToReverseComplement", + Array(String(), optional=True), + prefix="--ATTRIBUTES_TO_REVERSE_COMPLEMENT", + position=11, + doc="Attributes on negative strand reads that need to be reverse complemented.", + ), + ToolInput( + "clipAdapter", + Boolean(optional=True), + prefix="--CLIP_ADAPTERS", + position=11, + doc="Whether to clip adapters where identified." + ), + ToolInput( + "clipOverlappingReads", + Boolean(optional=True), + prefix="--CLIP_OVERLAPPING_READS", + position=11, + doc="For paired reads, soft clip the 3' end of each read if necessary so that it does not extend past the 5' end of its mate." + ), + ToolInput( + "expectedOrientations", + Array(String(), optional=True), + prefix="--EXPECTED_ORIENTATIONS", + position=11, + doc="The expected orientation of proper read pairs.", + ), + ToolInput( + "includeSecondaryAlginments", + Boolean(optional=True), + prefix="--INCLUDE_SECONDARY_ALIGNMENTS", + position=11, + doc="If false, do not write secondary alignments to output." 
+ ), + ToolInput( + "isBisulfiteSequencing", + Boolean(optional=True), + prefix="--IS_BISULFITE_SEQUENCE", + position=11, + doc="Whether the lane is bisulfite sequence (used when calculating the NM tag)." + ), + ToolInput( + "matchingDictionaryTags", + Array(String(), optional=True), + prefix="--MATCHING_DICTIONARY_TAGS", + position=11, + doc="List of Sequence Records tags that must be equal (if present) in the reference dictionary and in the aligned file.", + ), + ToolInput( + "maxInsertionsOrDeletions", + Int(optional=True), + prefix="--MAX_INSERTIONS_OR_DELETIONS", + position=11, + doc="The maximum number of insertions or deletions permitted for an alignment to be included." + ), + ToolInput( + "minUnclippedBases", + Int(optional=True), + prefix="--MIN_UNCLIPPED_BASES", + position=11, + doc="If UNMAP_CONTAMINANT_READS is set, require this many unclipped bases or else the read will be marked as contaminant." + ), + ToolInput( + "primaryAlignmentStrategy", + Int(optional=True), + prefix="--PRIMARY_ALIGNMENT_STRATEGY", + position=11, + doc="Strategy for selecting primary alignment when the aligner has provided more than one alignment for a pair or fragment, and none are marked as primary, more than one is marked as primary, or the primary alignment is filtered out for some reason." + ), + ToolInput( + "programGroupCommandLine", + String(optional=True), + prefix="--PROGRAM_GROUP_COMMAND_LINE", + position=11, + doc="The command line of the program group." + ), + ToolInput( + "programGroupName", + String(optional=True), + prefix="--PROGRAM_GROUP_NAME", + position=11, + doc="The name of the program group." + ), + ToolInput( + "programGroupVersion", + String(optional=True), + prefix="--PROGRAM_GROUP_VERSION", + position=11, + doc="The version of the program group." + ), + ToolInput( + "programRecordId", + String(optional=True), + prefix="--PROGRAM_RECORD_ID", + position=11, + doc="The program group ID of the aligner." 
+ ), + ToolInput( + "sortOrder", + String(optional=True), + prefix="-SO", + position=10, + doc="The --SORT_ORDER argument is an enumerated type (SortOrder), which can have one of " + "the following values: [unsorted, queryname, coordinate, duplicate, unknown]", + ), + ToolInput( + "unmapContaminantReads", + Boolean(optional=True), + prefix="--UNMAP_CONTAMINANT_READS", + position=11, + doc="Detect reads originating from foreign organisms (e.g. bacterial DNA in a non-bacterial sample),and unmap + label those reads accordingly." + ), + ToolInput( + "unmappedReadStrategy", + String(optional=True), + prefix="--UNMAPPED_READ_STRATEGY", + position=11, + doc="How to deal with alignment information in reads that are being unmapped (e.g. due to cross-species contamination.) Currently ignored unless UNMAP_CONTAMINANT_READS = true." + ), + ToolInput( + "addPgTagToReads", + Boolean(optional=True), + prefix="--ADD_PG_TAG_TO_READS", + position=11, + doc="Add PG tag to each read in a SAM or BAM" + ), + ToolInput( + "compressionLevel", + Int(optional=True), + prefix="--COMPRESSION_LEVEL", + position=11, + doc="Compression level for all compressed files created (e.g. BAM and GELI).", + ), + ToolInput( + "createIndex", + Boolean(optional=True), + prefix="--CREATE_INDEX", + position=11, + doc="Whether to create a BAM index when writing a coordinate-sorted BAM file.", + ), + ToolInput( + "createMd5File", + Boolean(optional=True), + prefix="--CREATE_MD5_FILE", + position=11, + doc="Whether to create an MD5 digest for any BAM or FASTQ files created.", + ), + ToolInput( + "maxRecordsInRam", + Int(optional=True), + prefix="--MAX_RECORDS_IN_RAM", + position=11, + doc="When writing SAM files that need to be sorted, this will specify the number of " + "records stored in RAM before spilling to disk. 
Increasing this number reduces " + "the number of file handles needed to sort a SAM file, and increases the amount of RAM needed.", + ), + ToolInput( + "quiet", + Boolean(optional=True), + prefix="--QUIET", + position=11, + doc="Whether to suppress job-summary info on System.err.", + ), + ToolInput( + "tmpDir", + String(optional=True), + prefix="--TMP_DIR", + position=11, + default="/tmp/", + doc="Undocumented option", + ), + ToolInput( + "useJdkDeflater", + Boolean(optional=True), + prefix="--use_jdk_deflater", + position=11, + doc="Whether to use the JdkDeflater (as opposed to IntelDeflater)", + ), + ToolInput( + "useJdkInflater", + Boolean(optional=True), + prefix="--use_jdk_inflater", + position=11, + doc="Whether to use the JdkInflater (as opposed to IntelInflater)", + ), + ToolInput( + "validationStringency", + String(optional=True), + prefix="--VALIDATION_STRINGENCY", + position=11, + doc="Validation stringency for all SAM files read by this program. Setting stringency to SILENT " + "can improve performance when processing a BAM file in which variable-length data " + "(read, qualities, tags) do not otherwise need to be decoded." 
+ "The --VALIDATION_STRINGENCY argument is an enumerated type (ValidationStringency), " + "which can have one of the following values: [STRICT, LENIENT, SILENT]", + ), + ToolInput( + "verbosity", + String(optional=True), + prefix="--verbosity", + position=11, + doc="The --verbosity argument is an enumerated type (LogLevel), which can have " + "one of the following values: [ERROR, WARNING, INFO, DEBUG]", + ), + ] diff --git a/janis_bioinformatics/tools/gatk4/mergebamalignment/versions.py b/janis_bioinformatics/tools/gatk4/mergebamalignment/versions.py new file mode 100644 index 000000000..2099c78ea --- /dev/null +++ b/janis_bioinformatics/tools/gatk4/mergebamalignment/versions.py @@ -0,0 +1,24 @@ +from .base import Gatk4MergeBamAlignmentBase +from ..versions import Gatk_4_0_12, Gatk_4_1_2_0, Gatk_4_1_3_0, Gatk_4_1_4_0, Gatk_4_1_4_1 + + +class Gatk4MergeBamAlignment_4_0(Gatk_4_0_12, Gatk4MergeBamAlignmentBase): + pass + + +class Gatk4MergeBamAlignment_4_1_2(Gatk_4_1_2_0, Gatk4MergeBamAlignmentBase): + pass + + +class Gatk4MergeBamAlignment_4_1_3(Gatk_4_1_3_0, Gatk4MergeBamAlignmentBase): + pass + + +class Gatk4MergeBamAlignment_4_1_4(Gatk_4_1_4_1, Gatk4MergeBamAlignmentBase): + pass + + +Gatk4MergeBamAlignmentLatest = Gatk4MergeBamAlignment_4_1_4 + +if __name__ == "__main__": + print(Gatk4MergeBamAlignmentLatest().help()) diff --git a/janis_bioinformatics/tools/gatk4/versions.py b/janis_bioinformatics/tools/gatk4/versions.py index a807821d2..913d2cb14 100644 --- a/janis_bioinformatics/tools/gatk4/versions.py +++ b/janis_bioinformatics/tools/gatk4/versions.py @@ -33,4 +33,18 @@ def version(self): return "4.1.4.0" -Gatk4Latest = Gatk_4_1_4_0 +class Gatk_4_1_4_1(ABC): + def container(self): + return "broadinstitute/gatk:4.1.4.1" + + def version(self): + return "4.1.4.1" + +class Gatk_4_1_5_0(ABC): + def container(self): + return "broadinstitute/gatk:4.1.5.0" + + def version(self): + return "4.1.5.0" + +Gatk4Latest = Gatk_4_1_5_0 diff --git 
a/janis_bioinformatics/tools/io_lib/__init__.py b/janis_bioinformatics/tools/io_lib/__init__.py
new file mode 100644
index 000000000..9603e5a48
--- /dev/null
+++ b/janis_bioinformatics/tools/io_lib/__init__.py
@@ -0,0 +1 @@
+from .scramble.versions import *
diff --git a/janis_bioinformatics/tools/io_lib/scramble/__init__.py b/janis_bioinformatics/tools/io_lib/scramble/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/janis_bioinformatics/tools/io_lib/scramble/base.py b/janis_bioinformatics/tools/io_lib/scramble/base.py
new file mode 100644
index 000000000..2768cda82
--- /dev/null
+++ b/janis_bioinformatics/tools/io_lib/scramble/base.py
@@ -0,0 +1,250 @@
+from abc import ABC
+from typing import Any, Dict
+
+from janis_bioinformatics.data_types import Bam, Cram, FastaFai, FastqGzPair, File
+from janis_bioinformatics.tools.bioinformaticstoolbase import BioinformaticsTool
+from janis_core import (
+    Boolean,
+    CaptureType,
+    CpuSelector,
+    Filename,
+    Float,
+    InputSelector,
+    Int,
+    Stdout,
+    String,
+    ToolArgument,
+    ToolInput,
+    ToolMetadata,
+    ToolOutput,
+    get_value_for_hints_and_ordered_resource_tuple,
+)
+
+# Memory value that ScrambleBase.memory() looks up for each CaptureType hint.
+SCRAMBLE_MEM_TUPLE = [
+    (
+        CaptureType.key(),
+        {
+            CaptureType.TARGETED: 8,
+            CaptureType.EXOME: 12,
+            CaptureType.CHROMOSOME: 12,
+            CaptureType.THIRTYX: 16,
+            CaptureType.NINETYX: 20,
+            CaptureType.THREEHUNDREDX: 24,
+        },
+    )
+]
+
+# Core count that ScrambleBase.cpus() looks up for each CaptureType hint.
+SCRAMBLE_CORES_TUPLE = [
+    (
+        CaptureType.key(),
+        {
+            CaptureType.TARGETED: 16,
+            CaptureType.EXOME: 20,
+            CaptureType.CHROMOSOME: 24,
+            CaptureType.THIRTYX: 30,
+            CaptureType.NINETYX: 32,
+            CaptureType.THREEHUNDREDX: 32,
+        },
+    )
+]
+
+
+class ScrambleBase(BioinformaticsTool, ABC):
+    """Janis wrapper for io_lib ``scramble``, fixed to BAM -> CRAM conversion.
+
+    The CRAM stream is read from stdout and stored under ``outputFilename``.
+    """
+
+    def tool(self):
+        """Return the tool identifier."""
+        return "scramble"
+
+    def friendly_name(self):
+        """Return the display name of the tool."""
+        return "scramble"
+
+    def tool_provider(self):
+        """Return the name of the suite that provides scramble."""
+        return "io_lib"
+
+    def base_command(self):
+        """Return the command that every invocation starts with."""
+        return ["scramble"]
+
+    def inputs(self):
+        """Return the required inputs followed by ``additional_inputs``."""
+        return [
+            ToolInput("inputFilename", Bam(), position=200),
+            ToolInput(
+                "reference", FastaFai(), prefix="-r", doc="Reference sequence file.",
+            ),
+            # Names the captured stdout (see outputs()); arguments() fixes the
+            # output format to CRAM, so the generated name ends in .cram.
+            ToolInput("outputFilename", Filename(extension=".cram")),
+            *ScrambleBase.additional_inputs,
+        ]
+
+    def arguments(self):
+        """Return the fixed arguments: BAM in, CRAM 3.0 out, compression -9."""
+        return [
+            ToolArgument("bam", prefix="-I", doc="input data format",),
+            ToolArgument("cram", prefix="-O", doc="output data format",),
+            ToolArgument(
+                "-9",
+                doc="compression settings for output cram file (-1=fast,-9=best)",
+            ),
+            ToolArgument(
+                "3.0",
+                prefix="-V",
+                separate_value_from_prefix=False,
+                doc="Cram version to output",
+            ),
+        ]
+
+    def outputs(self):
+        """Return the single CRAM output, captured from stdout."""
+        return [
+            ToolOutput(
+                "out", Stdout(Cram(), stdoutname=InputSelector("outputFilename"))
+            )
+        ]
+
+    def memory(self, hints: Dict[str, Any]):
+        """Return the memory matched by ``hints``, or 16 if nothing matches."""
+        val = get_value_for_hints_and_ordered_resource_tuple(hints, SCRAMBLE_MEM_TUPLE)
+        if val:
+            return val
+        return 16
+
+    def cpus(self, hints: Dict[str, Any]):
+        """Return the core count matched by ``hints``, or 4 if nothing matches."""
+        val = get_value_for_hints_and_ordered_resource_tuple(
+            hints, SCRAMBLE_CORES_TUPLE
+        )
+        if val:
+            return val
+        return 4
+
+    def bind_metadata(self):
+        """Return the descriptive metadata for this tool."""
+        from datetime import date
+
+        return ToolMetadata(
+            contributors=["Matthias De Smet (@mattdsm)"],
+            dateCreated=date(2020, 2, 27),
+            dateUpdated=date(2020, 2, 27),
+            institution="None",
+            doi=None,
+            keywords=["bam", "cram", "compression"],
+            documentationUrl="https://github.com/jkbonfield/io_lib/",
+            documentation="scramble: streaming bam to cram compression",
+        )
+
+    # Optional scramble flags; inputs() appends these after the required inputs.
+    additional_inputs = [
+        ToolInput(
+            "range",
+            String(optional=True),
+            prefix="-R",
+            doc="Specifies the refseq:start-end range",
+        ),
+        ToolInput(
+            "maxBases",
+            Int(optional=True),
+            prefix="-b",
+            default=5000000,
+            doc="Max. bases per slice, default 5000000.",
+        ),
+        ToolInput(
+            "maxSequences",
+            Int(optional=True),
+            prefix="-s",
+            default=10000,
+            doc="Sequences per slice, default 10000.",
+        ),
+        ToolInput(
+            "maxSlicesPerContainer",
+            Int(optional=True),
+            prefix="-S",
+            default=1,
+            doc="Slices per container, default 1.",
+        ),
+        # NOTE: "Seuence" is a misspelling of "Sequence"; it is kept because
+        # the input name is part of this tool's public interface.
+        ToolInput(
+            "embedReferenceSeuence",
+            Boolean(optional=True),
+            prefix="-e",
+            doc="Embed reference sequence.",
+        ),
+        ToolInput(
+            "nonReferenceBaseEncoding",
+            Boolean(optional=True),
+            prefix="-x",
+            doc="Non-reference based encoding.",
+        ),
+        ToolInput(
+            "multipleReferencesPerSlice",
+            Boolean(optional=True),
+            prefix="-M",
+            doc="Use multiple references per slice.",
+        ),
+        ToolInput(
+            "generateTags",
+            Boolean(optional=True),
+            prefix="-m",
+            doc="Generate MD and NM tags.",
+        ),
+        ToolInput(
+            "lzmaCompression",
+            Boolean(optional=True),
+            prefix="-Z",
+            doc="Also compress using lzma",
+        ),
+        ToolInput(
+            "discardReadNames",
+            Boolean(optional=True),
+            prefix="-n",
+            doc="Discard read names where possible.",
+        ),
+        ToolInput(
+            "preserveAuxTags",
+            Boolean(optional=True),
+            prefix="-P",
+            doc="Preserve all aux tags (incl RG,NM,MD).",
+        ),
+        ToolInput(
+            "preserveAuxTagSizes",
+            Boolean(optional=True),
+            prefix="-p",
+            doc="Preserve aux tag sizes ('i', 's', 'c').",
+        ),
+        ToolInput(
+            "noAddPG",
+            Boolean(optional=True),
+            prefix="-q",
+            doc="Don't add scramble @PG header line.",
+        ),
+        ToolInput(
+            "decodeStop",
+            Int(optional=True),
+            prefix="-N",
+            doc="Stop decoding after 'integer' sequences.",
+        ),
+        ToolInput(
+            "threads",
+            Int(optional=True),
+            default=CpuSelector(),
+            prefix="-t",
+            doc="Number of threads. (default = 1)",
+        ),
+        # -B is an on/off switch that takes no value, so it is typed Boolean.
+        ToolInput(
+            "enableQualityBinning",
+            Boolean(optional=True),
+            prefix="-B",
+            doc="Enable Illumina 8 quality-binning system (lossy).",
+        ),
+    ]
diff --git a/janis_bioinformatics/tools/io_lib/scramble/versions.py b/janis_bioinformatics/tools/io_lib/scramble/versions.py
new file mode 100644
index 000000000..fe191c4ce
--- /dev/null
+++ b/janis_bioinformatics/tools/io_lib/scramble/versions.py
@@ -0,0 +1,13 @@
+from ..versions import ioLib_1_14_1_2
+from .base import ScrambleBase
+
+
+class Scramble_1_14_1_2(ioLib_1_14_1_2, ScrambleBase):
+    """scramble combined with the ioLib_1_14_1_2 release mixin."""
+
+
+ScrambleLatest = Scramble_1_14_1_2
+
+
+if __name__ == "__main__":
+    print(ScrambleLatest().help())
diff --git a/janis_bioinformatics/tools/io_lib/versions.py b/janis_bioinformatics/tools/io_lib/versions.py
new file mode 100644
index 000000000..0d30afc93
--- /dev/null
+++ b/janis_bioinformatics/tools/io_lib/versions.py
@@ -0,0 +1,20 @@
+from abc import ABC
+
+
+class ioLib_1_14_1_2(ABC):
+    """Version mixin that pins a tool to io_lib 1.14.12.
+
+    NOTE: the class name reads as 1.14.1.2 but the pinned release is 1.14.12;
+    the name is kept because other modules import it.
+    """
+
+    def container(self):
+        """Return the container image for this io_lib release."""
+        return "quay.io/biocontainers/staden_io_lib:1.14.12--h244ad75_0"
+
+    def version(self):
+        """Return the io_lib version string."""
+        return "1.14.12"
+
+
+ioLibLatest = ioLib_1_14_1_2