From ce90d5380c88f31f284839b2509f6ac16683ac0a Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 14:49:56 -0400 Subject: [PATCH 01/38] add peak calling --- pipelines/skylab/atac/atac.wdl | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 8918a8d8ad..2516b8088b 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -537,6 +537,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" + peakcalling_bool = true # calculate chrom size dictionary based on text file chrom_size_dict={} @@ -553,10 +554,10 @@ task CreateFragmentFile { import csv # extract CB or BB (if preindex is true) tag from bam file to create fragment file - if preindex == "true": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) - elif preindex == "false": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + if preindex: + data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + else: + data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) # Add NHashID to metrics nhash_ID_value = "XXX" @@ -575,7 +576,6 @@ task CreateFragmentFile { with open(csv_file_path, mode='w', newline='') as file: writer = csv.writer(file) writer.writerows(flattened_data) # Write data - print(f"Dictionary successfully written to {csv_file_path}") atac_data = ad.read_h5ad("temp_metrics.h5ad") @@ -583,9 +583,23 @@ task CreateFragmentFile { atac_data.uns['NHashID'] = atac_nhash_id # calculate tsse metrics snap.metrics.tsse(atac_data, atac_gtf) - # Write new atac file atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad") + # peak calling? https://kzhang.org/SnapATAC2/tutorials/diff.html#Peak-calling-at-the-cluster-level + if peakcalling_bool: + print("Peak calling using MACS3") + snap.tl.macs3(atac_data) # were not including cell type? + print("test") + print(data.uns['macs3']) + + print("Writing peak files") + for k, peaks in data.uns['macs3'].items(): + peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) + ls + + # Write atac file + print("Writing h5ad file") + atac_data.write_h5ad("~{bam_base_name}.peaks.h5ad") CODE >>> From 82c5c454daed51a1a8c748b61b78c54b3201cea1 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 15:32:09 -0400 Subject: [PATCH 02/38] added output for peaks --- pipelines/skylab/atac/atac.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 2516b8088b..61f38361d6 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -598,7 +598,7 @@ task CreateFragmentFile { ls # Write atac file - print("Writing h5ad file") + print("Writing h5ad file with peaks") atac_data.write_h5ad("~{bam_base_name}.peaks.h5ad") CODE >>> @@ -614,6 +614,7 @@ task CreateFragmentFile { output { File fragment_file = "~{bam_base_name}.fragments.tsv" File Snap_metrics = "~{bam_base_name}.metrics.h5ad" + File peaks_h5 = "~{bam_base_name}.peaks.h5ad" # test File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" } } From 6e833d2f3cf02a3ad7cf865651679aee0c8f914a Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 16:59:52 -0400 Subject: [PATCH 03/38] fix --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 61f38361d6..c547c0ce8e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -537,7 +537,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = true + peakcalling_bool = True # calculate chrom size dictionary based on text file chrom_size_dict={} @@ -554,7 +554,7 @@ task CreateFragmentFile { import csv # extract CB or BB (if preindex is true) tag from bam file to create fragment file - if preindex: + if preindex == "true": data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) else: data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) From 76286b482f916296c02f21e6c703b85be8da2c6b Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 17:45:15 -0400 Subject: [PATCH 04/38] fix --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index c547c0ce8e..15e938c3e5 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -590,10 +590,10 @@ task CreateFragmentFile { print("Peak calling using MACS3") snap.tl.macs3(atac_data) # were not including cell type? print("test") - print(data.uns['macs3']) + print(atac_data.uns['macs3']) print("Writing peak files") - for k, peaks in data.uns['macs3'].items(): + for k, peaks in atac_data.uns['macs3'].items(): peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) ls From 90805ca0736cdce0c3c0e0b16c565a89ceaea06d Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 18:33:48 -0400 Subject: [PATCH 05/38] test --- pipelines/skylab/atac/atac.wdl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 15e938c3e5..6073ac0cfb 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -590,11 +590,6 @@ task CreateFragmentFile { print("Peak calling using MACS3") snap.tl.macs3(atac_data) # were not including cell type? print("test") - print(atac_data.uns['macs3']) - - print("Writing peak files") - for k, peaks in atac_data.uns['macs3'].items(): - peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) ls # Write atac file From b27011c80bd5cea6cb3d79e03b68e2494754fb52 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 17 Oct 2024 11:27:40 -0400 Subject: [PATCH 06/38] remove ls --- pipelines/skylab/atac/atac.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 6073ac0cfb..2326d27b64 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -590,7 +590,6 @@ task CreateFragmentFile { print("Peak calling using MACS3") snap.tl.macs3(atac_data) # were not including cell type? print("test") - ls # Write atac file print("Writing h5ad file with peaks") From c1f0e5709aedf2a4460695619575fb4e45a599ba Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 09:41:05 -0400 Subject: [PATCH 07/38] test --- pipelines/skylab/atac/atac.wdl | 51 ++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 2326d27b64..e354ca13a8 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -550,6 +550,7 @@ task CreateFragmentFile { import snapatac2.preprocessing as pp import snapatac2 as snap import anndata as ad + import scanpy as sc from collections import OrderedDict import csv @@ -585,10 +586,56 @@ task CreateFragmentFile { snap.metrics.tsse(atac_data, atac_gtf) atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad") - # peak calling? https://kzhang.org/SnapATAC2/tutorials/diff.html#Peak-calling-at-the-cluster-level + # Peak calling if peakcalling_bool: + print("Peak calling") + # Calculate and plot the size distribution of fragments + print("1.1 Calculating fragment size distribution") + snap.pl.frag_size_distr(atac_data) + # Need to parameterize + # Filter cells + print("1.2 Filtering cells") + snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) + print(atac_data) + # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. + print("1.3 Creating cell by bin matrix") + snap.pp.add_tile_matrix(atac_data) + print(atac_data) + # Feature selection + print("1.4 Feature selection") + snap.pp.select_features(atac_data) + # Run customized scrublet algorithm to identify potential doublets + print("1.5 Run scrublet to identify potential doublets") + snap.pp.scrublet(atac_data) + # Employ spectral embedding for dimensionality reduction + print("1.6 Employ spectral embedding for dimensionality reduction") + snap.tl.spectral(atac_data) + # Filter doublets based on scrublet scores + print("1.7 Filter doublets based on scrublet scores") + snap.pp.filter_doublets(atac_data, probability_threshold=0.5) + # Perform graph-based clustering to identify cell clusters. + # Build a k-nearest neighbour graph using snap.pp.knn + print("1.8 Perform knn graph-based clustering to identify cell clusters") + snap.pp.knn(atac_data) + # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph + print("1.9 Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph") + snap.tl.leiden(atac_data) + # Create the cell by gene activity matrix + print("1.10 Create the cell by gene activity matrix") + gene_mat = snap.pp.make_gene_matrix(atac_data, gene_anno=atac_gtf) + # Normalize the gene matrix + print("1.11 Normalize the gene matrix") + gene_mat.obs['leiden'] = atac_data.obs['leiden'] + sc.pp.normalize_total(gene_mat) + sc.pp.log1p(gene_mat) + sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon") + + for i in np.unique(gene_mat.obs['leiden']): + markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] + print(f"Cluster {i}: {', '.join(markers)}") + print("Peak calling using MACS3") - snap.tl.macs3(atac_data) # were not including cell type? + snap.tl.macs3(atac_data, groupby='leiden') print("test") # Write atac file From 779fe9bd27d4cb0cb8efad0e2fc9842f9685c765 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 13:23:45 -0400 Subject: [PATCH 08/38] test --- pipelines/skylab/atac/atac.wdl | 71 ++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index e354ca13a8..87b2cf9e83 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -588,44 +588,63 @@ task CreateFragmentFile { # Peak calling if peakcalling_bool: - print("Peak calling") + print("Peak calling starting...") + # Calculate and plot the size distribution of fragments - print("1.1 Calculating fragment size distribution") + print("Calculating fragment size distribution") snap.pl.frag_size_distr(atac_data) - # Need to parameterize - # Filter cells - print("1.2 Filtering cells") + print(atac_data) + + # Filter cells -- Need to parameterize + print("Filtering cells") snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) print(atac_data) + # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. - print("1.3 Creating cell by bin matrix") - snap.pp.add_tile_matrix(atac_data) - print(atac_data) + print("Creating cell by bin matrix") + atac_data_mod = snap.pp.add_tile_matrix(atac_data) + atac_data_mod.obsm = atac_data.obsm + print(atac_data_mod) + # Feature selection - print("1.4 Feature selection") - snap.pp.select_features(atac_data) + print("Feature selection") + snap.pp.select_features(atac_data_mod) + print(atac_data_mod) + # Run customized scrublet algorithm to identify potential doublets - print("1.5 Run scrublet to identify potential doublets") - snap.pp.scrublet(atac_data) + print("Run scrublet to identify potential doublets") + snap.pp.scrublet(atac_data_mod) + print(atac_data_mod) + # Employ spectral embedding for dimensionality reduction - print("1.6 Employ spectral embedding for dimensionality reduction") - snap.tl.spectral(atac_data) + print("Employ spectral embedding for dimensionality reduction") + snap.tl.spectral(atac_data_mod) + print(atac_data_mod) + # Filter doublets based on scrublet scores - print("1.7 Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data, probability_threshold=0.5) + print("Filter doublets based on scrublet scores") + snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) + print(atac_data_mod) + # Perform graph-based clustering to identify cell clusters. # Build a k-nearest neighbour graph using snap.pp.knn - print("1.8 Perform knn graph-based clustering to identify cell clusters") - snap.pp.knn(atac_data) + print("Perform knn graph-based clustering to identify cell clusters") + snap.pp.knn(atac_data_mod) + print(atac_data_mod) + # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph - print("1.9 Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph") - snap.tl.leiden(atac_data) + print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph") + snap.tl.leiden(atac_data_mod) + print(atac_data_mod) + # Create the cell by gene activity matrix - print("1.10 Create the cell by gene activity matrix") - gene_mat = snap.pp.make_gene_matrix(atac_data, gene_anno=atac_gtf) + print("Create the cell by gene activity matrix") + gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf) + print(atac_data_mod) + # Normalize the gene matrix - print("1.11 Normalize the gene matrix") - gene_mat.obs['leiden'] = atac_data.obs['leiden'] + print("Normalize the gene matrix") + gene_mat.obs['leiden'] = atac_data_mod.obs['leiden'] sc.pp.normalize_total(gene_mat) sc.pp.log1p(gene_mat) sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon") @@ -635,7 +654,9 @@ task CreateFragmentFile { print(f"Cluster {i}: {', '.join(markers)}") print("Peak calling using MACS3") - snap.tl.macs3(atac_data, groupby='leiden') + snap.tl.macs3(atac_data_mod, groupby='leiden') + + print(atac_data_mod.uns['macs3']) print("test") # Write atac file From 5743eff9753a3ccb183730ea275c59003aa4c89c Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 13:38:54 -0400 Subject: [PATCH 09/38] test --- pipelines/skylab/atac/atac.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 87b2cf9e83..1d3a03d2fe 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -527,7 +527,8 @@ task CreateFragmentFile { command <<< set -e pipefail - + pip3 install snapatac2==2.7.0 scanpy + python3 < Date: Fri, 18 Oct 2024 14:09:06 -0400 Subject: [PATCH 10/38] test --- pipelines/skylab/atac/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 1d3a03d2fe..ab32f30397 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -605,6 +605,7 @@ task CreateFragmentFile { print("Creating cell by bin matrix") atac_data_mod = snap.pp.add_tile_matrix(atac_data) atac_data_mod.obsm = atac_data.obsm + new_adata.uns["reference_sequences"] = atac_data.uns["reference_sequences"] print(atac_data_mod) # Feature selection From 1de1c85103a0edb23d234bce4adc714f7d000bc2 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 15:38:37 -0400 Subject: [PATCH 11/38] test --- pipelines/skylab/atac/atac.wdl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ab32f30397..d7d2e5b34f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -526,7 +526,9 @@ task CreateFragmentFile { } command <<< - set -e pipefail + set -eou pipefail + set -x + # install packages -- need to add to the docker pip3 install snapatac2==2.7.0 scanpy python3 < Date: Fri, 18 Oct 2024 15:59:36 -0400 Subject: [PATCH 12/38] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index d7d2e5b34f..90953769fa 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -540,7 +540,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = False + peakcalling_bool = True # calculate chrom size dictionary based on text file chrom_size_dict={} From 13a44f9ff6eef6356d34a528f2aeb88f7a348581 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 10:11:17 -0400 Subject: [PATCH 13/38] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 90953769fa..ae563eb49a 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -607,7 +607,7 @@ task CreateFragmentFile { print("Creating cell by bin matrix") atac_data_mod = snap.pp.add_tile_matrix(atac_data) print("set obsm") - atac_data_mod.obsm = atac_data.obsm + atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"] print("set uns") new_adata.uns["reference_sequences"] = atac_data.uns["reference_sequences"] print(atac_data_mod) From 3f12b9e3dc84031a401132792d6c7e6ff9549c96 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 16:38:31 -0400 Subject: [PATCH 14/38] made another task --- pipelines/skylab/atac/atac.wdl | 198 +++++++++++++++++++-------------- 1 file changed, 113 insertions(+), 85 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ae563eb49a..db4b94de64 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -155,6 +155,13 @@ workflow ATAC { } } + call PeakCalling { + input: + bam = BWAPairedEndAlignment.bam_aligned_output, + annotations_gtf = annotations_gtf, + metrics_h5ad = CreateFragmentFile.Snap_metrics + } + File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output]) File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics]) @@ -504,7 +511,6 @@ task CreateFragmentFile { File bam File annotations_gtf File chrom_sizes - File annotations_gtf Boolean preindex Int disk_size = 500 Int mem_size = 64 @@ -528,9 +534,7 @@ task CreateFragmentFile { command <<< set -eou pipefail set -x - # install packages -- need to add to the docker - pip3 install snapatac2==2.7.0 scanpy - + python3 <>> - # Peak calling - if peakcalling_bool: - print("Peak calling starting...") + runtime { + docker: docker_path + disks: "local-disk ${disk_size} SSD" + memory: "${mem_size} GiB" + cpu: nthreads + cpuPlatform: cpuPlatform + } + + output { + File fragment_file = "~{bam_base_name}.fragments.tsv" + File Snap_metrics = "~{bam_base_name}.metrics.h5ad" + File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + } +} - # Calculate and plot the size distribution of fragments - print("Calculating fragment size distribution") - snap.pl.frag_size_distr(atac_data) - print(atac_data) - # Filter cells -- Need to parameterize - print("Filtering cells") - snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) - print(atac_data) +task PeakCalling { + input { + File bam + File annotations_gtf + File metrics_h5ad + } + String bam_base_name = basename(bam, ".bam") + + command <<< + set -euo pipefail + set -x + + # install packages -- need to add to the docker + pip3 install snapatac2==2.7.0 scanpy + + python3 <>> - - runtime { - docker: docker_path - disks: "local-disk ${disk_size} SSD" - memory: "${mem_size} GiB" - cpu: nthreads - cpuPlatform: cpuPlatform - } - + output { - File fragment_file = "~{bam_base_name}.fragments.tsv" - File Snap_metrics = "~{bam_base_name}.metrics.h5ad" - File peaks_h5 = "~{bam_base_name}.peaks.h5ad" # test - File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + File peaks_h5ad = "~{bam_base_name}.peaks.h5ad" } -} + +} \ No newline at end of file From 08b2f2410dca33918a15456bd4963959cbbc6170 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 17:04:27 -0400 Subject: [PATCH 15/38] test --- pipelines/skylab/atac/atac.wdl | 38 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index db4b94de64..1a8a5533bd 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -153,14 +153,14 @@ workflow ATAC { atac_nhash_id = atac_nhash_id } - } - - call PeakCalling { + call PeakCalling { input: bam = BWAPairedEndAlignment.bam_aligned_output, annotations_gtf = annotations_gtf, - metrics_h5ad = CreateFragmentFile.Snap_metrics + metrics_h5ad = CreateFragmentFile.Snap_metrics, + docker_path = docker_prefix + snap_atac_docker } + } File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output]) File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) @@ -267,7 +267,6 @@ task GetNumSplits { } } - # trim read 1 and read 2 adapter sequeunce with cutadapt task TrimAdapters { input { @@ -615,8 +614,21 @@ task PeakCalling { File bam File annotations_gtf File metrics_h5ad + # copied from previous task + Int disk_size = 500 + Int mem_size = 64 + Int nthreads = 4 + String docker_path } String bam_base_name = basename(bam, ".bam") + + parameter_meta { + bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task." + annotations_gtf: "GTF for SnapATAC2 to calculate TSS sites of fragment file." + disk_size: "Disk size used in create fragment file step." + mem_size: "The size of memory used in create fragment file." + docker_path: "The docker image path containing the runtime environment for this task" + } command <<< set -euo pipefail @@ -627,12 +639,17 @@ task PeakCalling { python3 <>> + runtime { + docker: docker_path + disks: "local-disk ${disk_size} SSD" + memory: "${mem_size} GiB" + cpu: nthreads + } + output { File peaks_h5ad = "~{bam_base_name}.peaks.h5ad" } From 80c1416b5debd07775a8fefd0b5f0d3038a2bf6c Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 17:19:26 -0400 Subject: [PATCH 16/38] test --- pipelines/skylab/atac/atac.wdl | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 1a8a5533bd..d0ef3ee92e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -510,6 +510,7 @@ task CreateFragmentFile { File bam File annotations_gtf File chrom_sizes + File annotations_gtf Boolean preindex Int disk_size = 500 Int mem_size = 64 @@ -531,8 +532,7 @@ task CreateFragmentFile { } command <<< - set -eou pipefail - set -x + set -e pipefail python3 <>> @@ -608,7 +609,6 @@ task CreateFragmentFile { } } - task PeakCalling { input { File bam @@ -666,8 +666,10 @@ task PeakCalling { atac_data_mod = snap.pp.add_tile_matrix(atac_data) print("set obsm") atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"] - print("set uns") - new_adata.uns["reference_sequences"] = atac_data.uns["reference_sequences"] + print("set all uns") + for key in atac_data.uns.keys(): + print("set ",key) + new_adata.uns[key] = atac_data.uns[key] print(atac_data_mod) # Feature selection From b5191e526d28b1b8a83dc89c7a792b62d089351e Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 17:44:49 -0400 Subject: [PATCH 17/38] test --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index d0ef3ee92e..6a39a3472d 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -663,13 +663,13 @@ task PeakCalling { # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. print("Creating cell by bin matrix") - atac_data_mod = snap.pp.add_tile_matrix(atac_data) + atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False) print("set obsm") atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"] print("set all uns") for key in atac_data.uns.keys(): print("set ",key) - new_adata.uns[key] = atac_data.uns[key] + atac_data_mod.uns[key] = atac_data.uns[key] print(atac_data_mod) # Feature selection From 4725080516d8cffaa47382c659c3989bbb72775e Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 18:49:20 -0400 Subject: [PATCH 18/38] test --- pipelines/skylab/atac/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 6a39a3472d..28d746600d 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -642,6 +642,7 @@ task PeakCalling { # use snap atac2 import snapatac2 as snap import scanpy as sc + import numpy as np bam = "~{bam}" bam_base_name = "~{bam_base_name}" From f139f84743b66aefe5cab0a0c16d7e6838ed7179 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 22 Oct 2024 08:51:50 -0400 Subject: [PATCH 19/38] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 28d746600d..8af59d2a3f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -721,7 +721,7 @@ task PeakCalling { print(f"Cluster {i}: {', '.join(markers)}") print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden') + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") From 9a0ea9b47e5de7dec7d968a001894247351d225e Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 22 Oct 2024 10:08:42 -0400 Subject: [PATCH 20/38] test --- pipelines/skylab/atac/atac.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 8af59d2a3f..ccb474acc2 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -720,8 +720,9 @@ task PeakCalling { markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] print(f"Cluster {i}: {', '.join(markers)}") - print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) + if __name__ == '__main__': + print("Peak calling using MACS3") + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") From 04cf0d2a91df97d737a70ec562fe61cc1301fe15 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 22 Oct 2024 10:28:26 -0400 Subject: [PATCH 21/38] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ccb474acc2..5227c534ff 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -722,7 +722,7 @@ task PeakCalling { if __name__ == '__main__': print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") From f9bc8718b488bc13d1e20028fe56a2180d81013b Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 12:58:18 -0400 Subject: [PATCH 22/38] change a line --- pipelines/skylab/atac/atac.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 5227c534ff..9576c42602 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -637,6 +637,9 @@ task PeakCalling { # install packages -- need to add to the docker pip3 install snapatac2==2.7.0 scanpy + file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py + sed -i '164s/.*/\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + python3 < Date: Thu, 24 Oct 2024 14:44:52 -0400 Subject: [PATCH 23/38] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 9576c42602..5c57a7c6ba 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -638,7 +638,7 @@ task PeakCalling { pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + sed -i '164s/.*/peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 < Date: Thu, 24 Oct 2024 15:10:32 -0400 Subject: [PATCH 24/38] test --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 5c57a7c6ba..60bd780f22 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -638,7 +638,7 @@ task PeakCalling { pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + sed -i '164s/.*/\t\t\t\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 < Date: Thu, 24 Oct 2024 16:00:38 -0400 Subject: [PATCH 25/38] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 60bd780f22..563796f72f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -638,7 +638,7 @@ task PeakCalling { pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/\t\t\t\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 < Date: Thu, 24 Oct 2024 18:50:22 -0400 Subject: [PATCH 26/38] test --- pipelines/skylab/atac/atac.wdl | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 563796f72f..e11f6853c2 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -646,6 +646,7 @@ task PeakCalling { import snapatac2 as snap import scanpy as sc import numpy as np + import polars as pl bam = "~{bam}" bam_base_name = "~{bam_base_name}" @@ -723,11 +724,29 @@ task PeakCalling { markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] print(f"Cluster {i}: {', '.join(markers)}") - if __name__ == '__main__': - print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) - - atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") + + print("Peak calling using MACS3") + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) + + print("NarrowPeak") + for k, peaks in atac_data_mod.uns['macs3'].items(): + peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) + + print("test") + snap.ex.export_coverage( + atac_data_mod, + groupby='leiden', + ) + + print("Convert pl.DataFrame to pandas DataFrame") + # Convert pl.DataFrame to pandas DataFrame + for key in new_adata.uns.keys(): + if isinstance(new_adata.uns[key], pl.DataFrame): + print(key) + new_adata.uns[key] = new_adata.uns[key].to_pandas() + + print("Write into h5ad file") + atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad", compression='gzip') print("test") CODE From b15b146281e944f4a809ac364870a84fc387b458 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 18:50:52 -0400 Subject: [PATCH 27/38] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index e11f6853c2..f645309639 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -746,7 +746,7 @@ task PeakCalling { new_adata.uns[key] = new_adata.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad", compression='gzip') + atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") CODE From ff36c155c5b010c117ed3d34c9574344f54e0fe2 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 18:51:29 -0400 Subject: [PATCH 28/38] test --- pipelines/skylab/atac/atac.wdl | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index f645309639..dc41406540 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -724,20 +724,9 @@ task PeakCalling { markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] print(f"Cluster {i}: {', '.join(markers)}") - print("Peak calling using MACS3") snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) - print("NarrowPeak") - for k, peaks in atac_data_mod.uns['macs3'].items(): - peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) - - print("test") - snap.ex.export_coverage( - atac_data_mod, - groupby='leiden', - ) - print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame for key in new_adata.uns.keys(): From 27a419678613a1a624cdbcab2f7426a0476a9e34 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 19:49:17 -0400 Subject: [PATCH 29/38] test --- pipelines/skylab/atac/atac.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index dc41406540..41abc21bcd 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -729,10 +729,10 @@ task PeakCalling { print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame - for key in new_adata.uns.keys(): - if isinstance(new_adata.uns[key], pl.DataFrame): + for key in atac_data_mod.uns.keys(): + if isinstance(atac_data_mod.uns[key], pl.DataFrame): print(key) - new_adata.uns[key] = new_adata.uns[key].to_pandas() + atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") From 987b5d2d6f0bb8201d13e7fd1936eb992dba03e0 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 28 Oct 2024 15:02:49 -0400 Subject: [PATCH 30/38] Add peakcall to task --- pipelines/skylab/atac/atac.wdl | 96 +++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 41abc21bcd..6b563244e9 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -532,7 +532,14 @@ task CreateFragmentFile { } command <<< - set -e pipefail + set -euo pipefail + set -x + + # install packages -- need to add to the docker + pip3 install snapatac2==2.7.0 scanpy + + file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py + sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 <>> From cce58417212c7d576a7a049cd66579d010bf3cae Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 28 Oct 2024 15:21:15 -0400 Subject: [PATCH 31/38] test --- pipelines/skylab/atac/atac.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 6b563244e9..265bb5e90e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -562,6 +562,9 @@ task CreateFragmentFile { # use snap atac2 import snapatac2.preprocessing as pp import snapatac2 as snap + import scanpy as sc + import numpy as np + import polars as pl import anndata as ad from collections import OrderedDict import csv From bf0e9ce33829aafd1166d96fc08c6183effda6da Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 28 Oct 2024 17:57:44 -0400 Subject: [PATCH 32/38] add parameters --- pipelines/skylab/atac/atac.wdl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 265bb5e90e..974578dbd7 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -710,12 +710,17 @@ task PeakCalling { input { File bam File annotations_gtf - File metrics_h5ad - # copied from previous task + File metrics_h5ad + # SnapATAC2 parameters + Int min_counts = 5000 + Int min_tsse = 10 + Int max_counts = 100000 + Int doublet_probability_threshold = 0.5 + # Runtime attributes/docker + String docker_path Int disk_size = 500 Int mem_size = 64 - Int nthreads = 4 - String docker_path + Int nthreads = 4 } String bam_base_name = basename(bam, ".bam") @@ -749,6 +754,10 @@ task PeakCalling { bam_base_name = "~{bam_base_name}" atac_gtf = "~{annotations_gtf}" metrics_h5ad = "~{metrics_h5ad}" + min_counts = "~{min_counts}" + min_tsse = "~{min_tsse}" + max_counts = "~{max_counts}" + probability_threshold = "~{doublet_probability_threshold}" print("Peak calling starting...") atac_data = snap.read(metrics_h5ad) @@ -760,7 +769,7 @@ task PeakCalling { # Filter cells -- Need to parameterize print("Filtering cells") - snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) + snap.pp.filter_cells(atac_data, min_counts=min_counts, min_tsse=min_tsse, max_counts=max_counts) print(atac_data) # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. @@ -791,7 +800,7 @@ task PeakCalling { # Filter doublets based on scrublet scores print("Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) + snap.pp.filter_doublets(atac_data_mod, probability_threshold=probability_threshold) print(atac_data_mod) # Perform graph-based clustering to identify cell clusters. From 18f77d8d0189ba30ec445b88ace4ba528be8882b Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 4 Nov 2024 09:46:07 -0500 Subject: [PATCH 33/38] test --- pipelines/skylab/atac/atac.wdl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 89646be7ea..8ac108e959 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -715,7 +715,6 @@ task PeakCalling { Int min_counts = 5000 Int min_tsse = 10 Int max_counts = 100000 - Int doublet_probability_threshold = 0.5 # Runtime attributes/docker String docker_path Int disk_size = 500 @@ -757,7 +756,6 @@ task PeakCalling { min_counts = "~{min_counts}" min_tsse = "~{min_tsse}" max_counts = "~{max_counts}" - probability_threshold = "~{doublet_probability_threshold}" print("Peak calling starting...") atac_data = snap.read(metrics_h5ad) @@ -800,7 +798,7 @@ task PeakCalling { # Filter doublets based on scrublet scores print("Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data_mod, probability_threshold=probability_threshold) + snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) print(atac_data_mod) # Perform graph-based clustering to identify cell clusters. From db59231f001dfa60306e62f23cc024b1b9fdcbd4 Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Fri, 13 Dec 2024 10:24:41 -0500 Subject: [PATCH 34/38] peak calling --- pipelines/skylab/atac/atac.wdl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 7e42a65802..87df29afa0 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -763,13 +763,8 @@ task PeakCalling { command <<< set -euo pipefail set -x - - # install packages -- need to add to the docker - pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change - python3 < Date: Fri, 13 Dec 2024 11:00:33 -0500 Subject: [PATCH 35/38] remove bam_base_name from CreateFragments task --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 3290d22498..0a55334c15 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -625,7 +625,7 @@ task CreateFragmentFile { if peakcalling_bool: print("Peak calling starting...") - atac_data = snap.read("~{bam_base_name}.metrics.h5ad") + atac_data = snap.read("~{input_id}.metrics.h5ad") # Calculate and plot the size distribution of fragments print("Calculating fragment size distribution") @@ -706,7 +706,7 @@ task CreateFragmentFile { atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") + atac_data_mod.write_h5ad("~{input_id}.peaks.h5ad") print("test") CODE From 987e1e527e5cf1d4de8436aaa4ccdaddb7c79c48 Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Fri, 13 Dec 2024 11:10:40 -0500 Subject: [PATCH 36/38] separate task for macs3 consistent with create fragments --- pipelines/skylab/atac/atac.wdl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 0a55334c15..e03edb7e2c 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -542,12 +542,6 @@ task CreateFragmentFile { set -euo pipefail set -x - # install packages -- need to add to the docker - pip3 install snapatac2==2.7.0 scanpy - - file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change - python3 < Date: Mon, 16 Dec 2024 09:40:14 -0500 Subject: [PATCH 37/38] Update atac.wdl update snap_atac_docker tag to 2.0.0 --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index e03edb7e2c..ffd4ae3f0e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -61,7 +61,7 @@ workflow ATAC { String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919" String samtools_docker = "samtools-dist-bwa:3.0.0" String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311" - String snap_atac_docker = "snapatac2:1.1.0" + String snap_atac_docker = "snapatac2:2.0.0" # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { From b136ba30e94df8895f0def5592c012cd87be7628 Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Mon, 16 Dec 2024 10:26:30 -0500 Subject: [PATCH 38/38] snapatac2 updates --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ffd4ae3f0e..a2375ad802 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -551,7 +551,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = True + peakcalling_bool = True # set peakcalling expected_cells = ~{atac_expected_cells} # calculate chrom size dictionary based on text file