From 9be81c431ed4fc2c5c23ffe46cc22d1fc2b5ef8a Mon Sep 17 00:00:00 2001 From: mauricio Date: Mon, 3 Jun 2024 16:48:29 +0800 Subject: [PATCH 01/12] added optional parameters to meta.yml --- bio/reference/ensembl-annotation/meta.yaml | 2 ++ bio/reference/ensembl-sequence/meta.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/bio/reference/ensembl-annotation/meta.yaml b/bio/reference/ensembl-annotation/meta.yaml index b8fd0924a79..1c649e15bb3 100644 --- a/bio/reference/ensembl-annotation/meta.yaml +++ b/bio/reference/ensembl-annotation/meta.yaml @@ -6,3 +6,5 @@ output: - Ensemble GTF or GFF3 anotation file params: - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) + - branch: branch of ftp server to download cache data if required (optional; e.g. "plants") + - collection: collection of ftp server to download cache data if required (optional; e.g. "bacteria_0_collection") \ No newline at end of file diff --git a/bio/reference/ensembl-sequence/meta.yaml b/bio/reference/ensembl-sequence/meta.yaml index 20c769a0d1d..8b703c9b2bc 100644 --- a/bio/reference/ensembl-sequence/meta.yaml +++ b/bio/reference/ensembl-sequence/meta.yaml @@ -6,3 +6,5 @@ output: - fasta file params: - url: URL from where to download cache data (optional; by default is ``ftp://ftp.ensembl.org/pub``) + - branch: branch of ftp server to download cache data if required (optional; e.g. "plants") + - collection: collection of ftp server to download cache data if required (optional; e.g. 
"bacteria_0_collection") \ No newline at end of file From bff91b657fbe5d9bc26211dec1296b4287b9f54d Mon Sep 17 00:00:00 2001 From: mauricio Date: Mon, 3 Jun 2024 16:50:16 +0800 Subject: [PATCH 02/12] added collections optional parameter to ensembl-sequence, ensembl-annotation --- bio/reference/ensembl-annotation/test/Snakefile | 3 ++- bio/reference/ensembl-annotation/wrapper.py | 5 ++++- bio/reference/ensembl-sequence/test/Snakefile | 3 ++- bio/reference/ensembl-sequence/wrapper.py | 17 +++++++++++++---- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/bio/reference/ensembl-annotation/test/Snakefile b/bio/reference/ensembl-annotation/test/Snakefile index 3a30ca70bde..e3b777162f0 100644 --- a/bio/reference/ensembl-annotation/test/Snakefile +++ b/bio/reference/ensembl-annotation/test/Snakefile @@ -22,7 +22,8 @@ rule get_annotation_gz: release="105", build="GRCh37", flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. - # branch="plants", # optional: specify branch + # branch="plants" or "bacteria", # optional: specify branch + # collection="bacteria_0_collection", # optional: specify collection log: "logs/get_annotation.log", params: diff --git a/bio/reference/ensembl-annotation/wrapper.py b/bio/reference/ensembl-annotation/wrapper.py index c3d655cbb25..7c8b6207b27 100644 --- a/bio/reference/ensembl-annotation/wrapper.py +++ b/bio/reference/ensembl-annotation/wrapper.py @@ -31,6 +31,9 @@ elif snakemake.params.get("branch"): branch = snakemake.params.branch + "/" +collection = "" +if snakemake.params.get("collection"): + collection = snakemake.params.collection + "/" flavor = snakemake.params.get("flavor", "") if flavor: @@ -49,7 +52,7 @@ url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") -url = f"{url}/{branch}release-{release}/{out_fmt}/{species}/{species.capitalize()}.{build}.{gtf_release}.{flavor}{suffix}" +url = 
f"{url}/{branch}release-{release}/{out_fmt}/{collection}{species}/{species.capitalize()}.{build}.{gtf_release}.{flavor}{suffix}" try: diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index fec1c746a4c..034814648f6 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -40,7 +40,8 @@ rule get_multiple_chromosome: build="R64-1-1", release="101", chromosome=["I", "II"], # optional: restrict to one or multiple chromosomes - # branch="plants", # optional: specify branch + # branch="bacteria", # optional: specify branch + # collection="bacteria_0_collection", # optional: specify collection log: "logs/get_genome.log", cache: "omit-software" # save space and time with between workflow caching (see docs) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index cb2956a6c04..b53e6439de4 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -19,11 +19,20 @@ elif snakemake.params.get("branch"): branch = snakemake.params.branch + "/" +collection = "" +if snakemake.params.get("collection"): + collection = snakemake.params.collection + "/" + log = snakemake.log_fmt_shell(stdout=False, stderr=True) -spec = ("{build}" if int(release) > 75 else "{build}.{release}").format( - build=build, release=release -) +if branch=="" or branch == "grch37/": + spec = ("{build}" if int(release) > 75 else "{build}.{release}").format( + build=build, release=release + ) +else: + spec = ("{build}" if int(release) > 30 else "{build}.{release}").format( + build=build, release=release + ) suffixes = "" datatype = snakemake.params.get("datatype", "") @@ -52,7 +61,7 @@ url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub") spec = spec.format(build=build, release=release) -url_prefix = f"{url}/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}" +url_prefix 
= f"{url}/{branch}release-{release}/fasta/{collection}{species}/{datatype}/{species.capitalize()}.{spec}" success = False for suffix in suffixes: From b8ec4f9a5a5dee97201c7bf6110122817d1de25c Mon Sep 17 00:00:00 2001 From: mauricio Date: Mon, 3 Jun 2024 17:11:29 +0800 Subject: [PATCH 03/12] updated Snakefiles with test cases for off branch references --- .../ensembl-annotation/test/Snakefile | 20 ++++++++++++---- bio/reference/ensembl-sequence/test/Snakefile | 23 +++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/bio/reference/ensembl-annotation/test/Snakefile b/bio/reference/ensembl-annotation/test/Snakefile index e3b777162f0..09808eaf033 100644 --- a/bio/reference/ensembl-annotation/test/Snakefile +++ b/bio/reference/ensembl-annotation/test/Snakefile @@ -6,7 +6,6 @@ rule get_annotation: release="105", build="GRCh37", flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. - # branch="plants", # optional: specify branch log: "logs/get_annotation.log", cache: "omit-software" # save space and time with between workflow caching (see docs) @@ -22,12 +21,25 @@ rule get_annotation_gz: release="105", build="GRCh37", flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. 
- # branch="plants" or "bacteria", # optional: specify branch - # collection="bacteria_0_collection", # optional: specify collection log: "logs/get_annotation.log", + cache: "omit-software" # save space and time with between workflow caching (see docs) + wrapper: + "master/bio/reference/ensembl-annotation" + + +rule get_off_branch_annotation: + output: + "refs/annotation.gtf", params: - url="http://ftp.ensembl.org/pub", + species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045", + release="59", # note latest release varies with url + build="ASM904v1", + branch="bacteria", # optional for off branch genomes + url="ftp://ftp.ensemblgenomes.org/pub/", # optional set ftp server source + collection="bacteria_0_collection", # optional set collection source for genome + log: + "logs/get_annotation.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-annotation" diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index 034814648f6..4c263cbf78e 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -22,11 +22,8 @@ rule get_single_chromosome: build="R64-1-1", release="101", chromosome=["II"], # optional: restrict to one or multiple chromosomes, for multiple see below - # branch="plants", # optional: specify branch log: "logs/get_genome.log", - params: - url="http://ftp.ensembl.org/pub", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-sequence" @@ -40,8 +37,24 @@ rule get_multiple_chromosome: build="R64-1-1", release="101", chromosome=["I", "II"], # optional: restrict to one or multiple chromosomes - # branch="bacteria", # optional: specify branch - # collection="bacteria_0_collection", # optional: specify collection + log: + "logs/get_genome.log", + cache: "omit-software" # save space and time with 
between workflow caching (see docs) + wrapper: + "master/bio/reference/ensembl-sequence" + + +rule get_off_branch_genome: + output: + "refs/genome.fasta", + params: + species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045", + datatype="dna", + build="ASM904v1", + release="59", # note latest release varies with url + branch="bacteria", # optional for off branch genomes + url="ftp://ftp.ensemblgenomes.org/pub/", # optional set ftp server source + collection="bacteria_0_collection", # optional set collection source for genome log: "logs/get_genome.log", cache: "omit-software" # save space and time with between workflow caching (see docs) From ff4b80341bab01aa1dc3e5047d11b085bdaeb7f1 Mon Sep 17 00:00:00 2001 From: mauricio Date: Mon, 3 Jun 2024 17:52:51 +0800 Subject: [PATCH 04/12] linted and prep for testing --- bio/reference/ensembl-annotation/test/Snakefile | 2 +- bio/reference/ensembl-sequence/test/Snakefile | 2 +- bio/reference/ensembl-sequence/wrapper.py | 2 +- test.py | 16 ++++++++++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/bio/reference/ensembl-annotation/test/Snakefile b/bio/reference/ensembl-annotation/test/Snakefile index 09808eaf033..28e24cde69e 100644 --- a/bio/reference/ensembl-annotation/test/Snakefile +++ b/bio/reference/ensembl-annotation/test/Snakefile @@ -30,7 +30,7 @@ rule get_annotation_gz: rule get_off_branch_annotation: output: - "refs/annotation.gtf", + "refs/off_branch_annotation.gtf", params: species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045", release="59", # note latest release varies with url diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index 4c263cbf78e..d8af8085fd1 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -46,7 +46,7 @@ rule get_multiple_chromosome: rule get_off_branch_genome: output: - "refs/genome.fasta", + "refs/off_branch_genome.fasta", params: 
species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045", datatype="dna", diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index b53e6439de4..6eb9116235e 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -25,7 +25,7 @@ log = snakemake.log_fmt_shell(stdout=False, stderr=True) -if branch=="" or branch == "grch37/": +if branch == "" or branch == "grch37/": spec = ("{build}" if int(release) > 75 else "{build}.{release}").format( build=build, release=release ) diff --git a/test.py b/test.py index 8ab245518f1..36642647644 100644 --- a/test.py +++ b/test.py @@ -5564,6 +5564,14 @@ def test_ensembl_sequence_chromosomes(): ) +@skip_if_not_modified +def test_ensembl_sequence_off_branch(): + run( + "bio/reference/ensembl-sequence", + ["snakemake", "--cores", "1", "refs/off_branch_genome.fasta", "--use-conda", "-F"], + ) + + @skip_if_not_modified def test_ensembl_sequence_chromosome_old_release(): run( @@ -5597,6 +5605,14 @@ def test_ensembl_annotation_gtf_gz(): ) +@skip_if_not_modified +def test_ensembl_off_branch_annotation_gtf(): + run( + "bio/reference/ensembl-annotation", + ["snakemake", "--cores", "1", "refs/off_branch_annotation.gtf", "--use-conda", "-F"], + ) + + @skip_if_not_modified def test_ensembl_variation(): run( From 8d687fa8073755145ab19413584b0948e5aaf88a Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Tue, 11 Jun 2024 17:49:24 +0800 Subject: [PATCH 05/12] Update bio/reference/ensembl-annotation/test/Snakefile Co-authored-by: Filipe G. 
Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-annotation/test/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/reference/ensembl-annotation/test/Snakefile b/bio/reference/ensembl-annotation/test/Snakefile index 28e24cde69e..a340cdc5a6a 100644 --- a/bio/reference/ensembl-annotation/test/Snakefile +++ b/bio/reference/ensembl-annotation/test/Snakefile @@ -39,7 +39,7 @@ rule get_off_branch_annotation: url="ftp://ftp.ensemblgenomes.org/pub/", # optional set ftp server source collection="bacteria_0_collection", # optional set collection source for genome log: - "logs/get_annotation.log", + "logs/get_off_branch_annotation.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-annotation" From 5bbd3375d230bb9c05584b67fdcebc9a6c2acb33 Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Tue, 11 Jun 2024 17:49:57 +0800 Subject: [PATCH 06/12] Update bio/reference/ensembl-annotation/test/Snakefile Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-annotation/test/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/reference/ensembl-annotation/test/Snakefile b/bio/reference/ensembl-annotation/test/Snakefile index a340cdc5a6a..b97cdb86ab0 100644 --- a/bio/reference/ensembl-annotation/test/Snakefile +++ b/bio/reference/ensembl-annotation/test/Snakefile @@ -22,7 +22,7 @@ rule get_annotation_gz: build="GRCh37", flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP. 
log: - "logs/get_annotation.log", + "logs/get_annotation_gz.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-annotation" From b098ab1ffe0589a6dd32da403fe5ac5a2bdee4ce Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Tue, 11 Jun 2024 17:50:03 +0800 Subject: [PATCH 07/12] Update bio/reference/ensembl-sequence/test/Snakefile Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-sequence/test/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index d8af8085fd1..9e25a003c8d 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -56,7 +56,7 @@ rule get_off_branch_genome: url="ftp://ftp.ensemblgenomes.org/pub/", # optional set ftp server source collection="bacteria_0_collection", # optional set collection source for genome log: - "logs/get_genome.log", + "logs/get_off_branch_genome.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-sequence" From af777d4a88e0d15741f36bbdafd5de48c236e96b Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Tue, 11 Jun 2024 17:50:09 +0800 Subject: [PATCH 08/12] Update bio/reference/ensembl-sequence/test/Snakefile Co-authored-by: Filipe G. 
Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-sequence/test/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index 9e25a003c8d..13e238efa6c 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -23,7 +23,7 @@ rule get_single_chromosome: release="101", chromosome=["II"], # optional: restrict to one or multiple chromosomes, for multiple see below log: - "logs/get_genome.log", + "logs/get_single_genome.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-sequence" From 7d90c4294bf32dda14c12dab692b4e7da4e1682c Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Tue, 11 Jun 2024 17:50:19 +0800 Subject: [PATCH 09/12] Update bio/reference/ensembl-sequence/test/Snakefile Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-sequence/test/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile index 13e238efa6c..aa4961c5027 100644 --- a/bio/reference/ensembl-sequence/test/Snakefile +++ b/bio/reference/ensembl-sequence/test/Snakefile @@ -38,7 +38,7 @@ rule get_multiple_chromosome: release="101", chromosome=["I", "II"], # optional: restrict to one or multiple chromosomes log: - "logs/get_genome.log", + "logs/get_multiple_chromosome.log", cache: "omit-software" # save space and time with between workflow caching (see docs) wrapper: "master/bio/reference/ensembl-sequence" From 79af81d3990c76be0a75802d00dff164d043aec6 Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Tue, 11 Jun 2024 17:53:13 +0800 Subject: [PATCH 10/12] Update bio/reference/ensembl-annotation/wrapper.py Co-authored-by: Filipe G. 
Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-annotation/wrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bio/reference/ensembl-annotation/wrapper.py b/bio/reference/ensembl-annotation/wrapper.py index 7c8b6207b27..b61f2b67531 100644 --- a/bio/reference/ensembl-annotation/wrapper.py +++ b/bio/reference/ensembl-annotation/wrapper.py @@ -31,9 +31,9 @@ elif snakemake.params.get("branch"): branch = snakemake.params.branch + "/" -collection = "" -if snakemake.params.get("collection"): - collection = snakemake.params.collection + "/" +collection = snakemake.params.get("collection", "") +if collection: + collection = f"{collection}/" flavor = snakemake.params.get("flavor", "") if flavor: From e9bfedf7a5513fd4c6352413fedf59f54bc3a5ab Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Tue, 11 Jun 2024 17:53:22 +0800 Subject: [PATCH 11/12] Update bio/reference/ensembl-sequence/wrapper.py Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-sequence/wrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index 6eb9116235e..af9f3c3041f 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -19,9 +19,9 @@ elif snakemake.params.get("branch"): branch = snakemake.params.branch + "/" -collection = "" -if snakemake.params.get("collection"): - collection = snakemake.params.collection + "/" +collection = snakemake.params.get("collection", "") +if collection: + collection = f"{collection}/" log = snakemake.log_fmt_shell(stdout=False, stderr=True) From 47ae26f32ab773ea32a181a2e01972b6861c6175 Mon Sep 17 00:00:00 2001 From: Mauricio Rodriguez G Date: Wed, 12 Jun 2024 09:00:36 +0800 Subject: [PATCH 12/12] Update bio/reference/ensembl-sequence/wrapper.py Co-authored-by: Filipe G. 
Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/reference/ensembl-sequence/wrapper.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py index af9f3c3041f..874c9219100 100644 --- a/bio/reference/ensembl-sequence/wrapper.py +++ b/bio/reference/ensembl-sequence/wrapper.py @@ -26,13 +26,9 @@ log = snakemake.log_fmt_shell(stdout=False, stderr=True) if branch == "" or branch == "grch37/": - spec = ("{build}" if int(release) > 75 else "{build}.{release}").format( - build=build, release=release - ) + spec = f"{build}" if int(release) > 75 else f"{build}.{release}" else: - spec = ("{build}" if int(release) > 30 else "{build}.{release}").format( - build=build, release=release - ) + spec = f"{build}" if int(release) > 30 else f"{build}.{release}" suffixes = "" datatype = snakemake.params.get("datatype", "")