From 99d40f168ecbbc588641695b32b1ba8eecd12519 Mon Sep 17 00:00:00 2001
From: Mitchell Robert Vollger
Date: Wed, 31 Jul 2024 09:03:52 -0700
Subject: [PATCH] Development branch for the next release (#29)

This is a PR for the next version of FIRE that will be more robust to de novo
assemblies.

Some features:

- Save the Fiber-seq data in CRAM format
- Replace UCSC utilities with bigtools
- Replace some bedtools operations with gia
- Switch to polars for coverage calculation
- Rename config options to uppercase variables
- Fix #28: incorrect counts in a plot
- Fix #26: FIRE now appears to be deterministic; however, this is not yet
  guaranteed.
---
 INSTALL.md                                 |  18 ++++
 README.md                                  |  20 +---
 _config.yml                                |  20 ++++
 config/README.md                           |   2 +-
 docs/README.md                             |   2 -
 fire                                       |   2 +-
 workflow/Snakefile                         | 111 +++++++++++----------
 workflow/envs/env.yaml                     |   6 +-
 workflow/envs/python.yaml                  |   2 +-
 workflow/envs/runner.yaml                  |  13 +++
 workflow/rules/FDR-peaks.smk               |  52 +++++-----
 workflow/rules/apply-model.smk             |  77 +++++++++-----
 workflow/rules/common.smk                  |  61 +++++++++--
 workflow/rules/coverages.smk               |  58 ++++++-----
 workflow/rules/decorated-reads.smk         |  34 ++++---
 workflow/rules/peak-stats.smk              |  24 +++--
 workflow/rules/track-hub.smk               |  69 +++++++------
 workflow/scripts/cov.py                    |  46 +++++++--
 workflow/scripts/fire-null-distribution.py |   4 +-
 workflow/scripts/peaks-vs-percent.R        |  15 ++-
 workflow/templates/bed3.as                 |   6 ++
 workflow/templates/bed9.as                 |  13 +++
 22 files changed, 425 insertions(+), 230 deletions(-)
 create mode 100644 INSTALL.md
 create mode 100644 _config.yml
 create mode 100644 workflow/envs/runner.yaml
 create mode 100644 workflow/templates/bed3.as
 create mode 100644 workflow/templates/bed9.as

diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 0000000000..1016ad6a0d
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,18 @@
+# Install
+
+You will need **snakemake**, which you can install using conda/mamba, e.g.:
+```
+mamba create -c conda-forge -c bioconda -n snakemake 'snakemake>=8.4'
+```
+
+Finally, if you wish to distribute jobs across a cluster, you will need to install the appropriate [snakemake executor plugin](https://snakemake.github.io/snakemake-plugin-catalog/). For example, to use SLURM you can install the `snakemake-executor-plugin-slurm` plugin using pip:
+```
+pip install snakemake-executor-plugin-slurm
+```
+
+We recommend adding a snakemake conda prefix to your `bashrc`, e.g., in the Stergachis lab add:
+```bash
+export SNAKEMAKE_CONDA_PREFIX=/mmfs1/gscratch/stergachislab/snakemake-conda-envs
+export APPTAINER_CACHEDIR=/mmfs1/gscratch/stergachislab/snakemake-conda-envs/apptainer-cache
+```
+Then snakemake installs all the additional requirements as conda envs in that directory.
diff --git a/README.md b/README.md
index d4baba773f..8ff9af3b50 100644
--- a/README.md
+++ b/README.md
@@ -6,25 +6,7 @@ A Snakemake workflow for calling Fiber-seq Inferred Regulatory Elements (FIREs)
 
 ## Install
 
-You will need **snakemake** and all the **UCSC Kent utilities** and the **latest version** of them (v455).
-
-You can install snakemake using conda/mamba, e.g.:
-```
-mamba create -c conda-forge -c bioconda -n snakemake 'snakemake>=8.4'
-```
-
-You can find the UCSC kent utilities at [this url](http://hgdownload.soe.ucsc.edu/admin/exe/). You will need to add the directory containing the utilities to your `PATH` environment variable.
-
-Finally, if you wish to distribute jobs across a cluster you will need to install the appropriate [snakemake executor plugin](https://snakemake.github.io/snakemake-plugin-catalog/). For example, to use SLURM you can install the `snakemake-executor-slurm` plugin using pip:
-```
-pip install snakemake-executor-plugin-slurm
-```
-
-We recommend adding a snakemake conda prefix to your `bashrc`, e.g. in the Stergachis lab add:
-```bash
-export SNAKEMAKE_CONDA_PREFIX=/mmfs1/gscratch/stergachislab/snakemake-conda-envs
-```
-Then snakemake installs all the additional requirements as conda envs in that directory.
+Please install `snakemake`. For detailed instructions see the [installation README](/INSTALL.md).
 
 ## Configuring
 
diff --git a/_config.yml b/_config.yml
new file mode 100644
index 0000000000..ac3cdded7d
--- /dev/null
+++ b/_config.yml
@@ -0,0 +1,20 @@
+#theme: jekyll-theme-minimal
+#theme: minima
+remote_theme: jekyll/minima
+#logo: /assets/img/fiber_tools_grey.png
+
+minima:
+  skin: dark
+
+defaults:
+  - scope:
+      path: "README.md"
+    values:
+      permalink: "index.html"
+
+exclude:
+  - target
+  - models
+  - dists
+  - tmp.*
+  - temp
diff --git a/config/README.md b/config/README.md
index 46088df967..0d85d6cf14 100644
--- a/config/README.md
+++ b/config/README.md
@@ -10,7 +10,7 @@ Reference `fasta` file:
 ```
 ref: /path/to/hg38.fa
 ```
-Manifest of input sample(s), must have two white-space separated columns: sample name (`sample`) and input bam file path (`bam`). See `config.tbl` for an example.
+Manifest of input sample(s); it must have two whitespace-separated columns: sample name (`sample`) and input bam file path (`bam`). See `config.tbl` for an example. The `bam` file must be indexed and aligned to the reference genome specified by the `ref` option.
 ```
 manifest: config/config.tbl
 ```
diff --git a/docs/README.md b/docs/README.md
index 2c0349a72d..17c1e543f7 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -102,9 +102,7 @@ The peak results are reported in the file `FDR-peaks/FDR-FIRE-peaks.bed.gz` with
 | | {sample}.{median, maximum,minimum}.coverage.txt | Allowed coverage range for analysis |
 | fiber-calls | | |
 | | FIRE.bed.gz | Every FIRE element in every fiber |
-| | model.results.bed.gz | Every FIRE, linker, and nucleosome in every fiber |
 | | fire-fibers.bed.gz | bed12 start and end of every fiber |
-| | fire-fiber-decorators.bed.gz | decorator file that adds annotations of FIRE elements to the bed12 file |
 | FDR-peaks | | |
 | | FDR-FIRE-peaks.bed.gz | Fiber-seq peak calls |
 | | FDR-wide-peaks.bed.gz | Fiber-seq wide peak calls |
diff --git a/fire b/fire
index aab4e405f8..e5a9611669 100755
--- a/fire
+++ b/fire
@@ -30,7 +30,7 @@ if [[ "${has_config}" == false ]]; then
 fi
 
 # check for executables
-for x in snakemake bedToBigBed bedGraphToBigWig; do
+for x in snakemake; do
     if [[ ! $(type -P "${x}") ]]; then
         echo "Error: ${x} not found in PATH, but it is required for FIRE."
         exit 1
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 0984c6850b..d26e470117 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -4,60 +4,64 @@
 import sys
 import os
 from snakemake.utils import min_version
 
-min_version("8.4.0")
+min_version("8.12.0")
+
+# forces pandas, pysam, etc. to be available in the environment
+conda: "envs/runner.yaml"
 
 # Force users to use the same underlying OS via singularity.
 # container: "docker://condaforge/mambaforge:23.3.1-1"
 # container: "docker://continuumio/miniconda3"
 
-ref = config["ref"]
-ref_name = config["ref_name"]
+# setup shared functions
+include: "rules/common.smk"
 
-dhs = config.get("dhs", workflow.source_path("annotations/GM12878_DHS.bed.gz"))
-excludes = config.get("excludes", [])
-if ref_name == "hg38" or ref_name == "GRCh38":
-    files = [
-        "annotations/hg38.blacklist.ENCFF356LFX.bed.gz",
-        "annotations/hg38.gap.bed.gz",
-        "annotations/SDs.merged.hg38.bed.gz",
-    ]
-    excludes += [workflow.source_path(file) for file in files]
-max_t = config.get("max_t", 4)
-keep_chrs = config.get("keep_chromosomes", ".*")
+# reference genome and reference regions
+REF = get_ref()
+FAI = get_fai()
+REF_NAME = config["ref_name"]
+EXCLUDES = get_excludes()
+KEEP_CHRS = config.get("keep_chromosomes", ".*")
 
 # coverage requirements
-min_coverage = config.get("min_coverage", 4)
-coverage_within_n_sd = config.get("coverage_within_n_sd", 5)
+MIN_COVERAGE = config.get("min_coverage", 4)
+COVERAGE_WITHIN_N_SD = config.get("coverage_within_n_sd", 5)
 
 # sample, haplotype, and chromosome wildcard building
-fai = pd.read_csv(f"{ref}.fai", sep="\t", names=["chr", "length", "x", "y", "z"])
-default_env = config.get("env", "../envs/env.yaml")
-data = pd.read_csv(config["manifest"], sep=r"\s+", comment="#").set_index("sample")
-haps = ["all", "hap1", "hap2", "unk"]
-not_all = ["hap1", "hap2", "unk"]
-not_unk = ["all", "hap1", "hap2"]
-min_fire_fdr = config.get("min_fire_fdr", 0.10)
+FAI_DF = get_fai_df()
+DEFAULT_ENV = config.get("env", "../envs/env.yaml")
+MANIFEST = get_manifest()
+MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)
 
 # FDR / peak calling thresholds
-max_peak_fdr = config.get("max_peak_fdr", 0.05)
-min_per_acc_peak = config.get("min_per_acc_peak", None)
-if min_per_acc_peak is None:
-    min_per_acc_peak = 0.0
+MAX_PEAK_FDR = config.get("max_peak_fdr", 0.05)
+MIN_PER_ACC_PEAK = config.get("min_per_acc_peak", None)
+if MIN_PER_ACC_PEAK is None:
+    MIN_PER_ACC_PEAK = 0.0
 else:
-    max_peak_fdr = 1.0
-
+    MAX_PEAK_FDR = 1.0
+MIN_FRAC_ACCESSIBLE = config.get("min_frac_accessible", 0)
 
+# Misc sets of wildcards
+haps = ["all", "hap1", "hap2", "unk"]
+not_unk = ["all", "hap1", "hap2"]
 types = ["fdr", "acc", "link", "nuc"]
 types_to_col = {"fdr": 4, "acc": 5, "link": 6, "nuc": 7}
-
 bw_types = ["log_FDR"]  # "score", "FDR",
 bw_types = bw_types + [f"{t}_H1" for t in bw_types] + [f"{t}_H2" for t in bw_types]
 el_types = ["fire", "linker", "nucleosome"]
 
+# developer options
+FT_EXE = config.get("ft_exe", "ft")
+ONT = config.get("ont", False)
+USE_ONT = ""
+if ONT:
+    USE_ONT = " --ont --ml 225 "
+MIN_UNRELIABLE_COVERAGE_LEN = config.get("min_unreliable_coverage_len", 50)
+
 
-include: "rules/common.smk"
 include: "rules/apply-model.smk"
 include: "rules/coverages.smk"
 include: "rules/FDR-peaks.smk"
@@ -66,13 +70,10 @@ include: "rules/decorated-reads.smk"
 include: "rules/track-hub.smk"
 
 
-chroms = get_chroms()
-
-
 wildcard_constraints:
-    chrom="|".join(chroms),
+    chrom="|".join(get_chroms()),
     call="|".join(["msp", "m6a"]),
-    sm="|".join(data.index),
+    sm="|".join(MANIFEST.index),
     types="|".join(types),
     fdr=r"\d+",
     hp="|".join(haps),
@@ -83,39 +84,41 @@ wildcard_constraints:
 localrules:
     trackhub,
 
+
 rule all:
     input:
         # coverage information
-        expand(rules.genome_bedgraph.output, sm=data.index),
-        expand(rules.coverage.output, sm=data.index),
-        expand(rules.exclude_from_shuffle.output, sm=data.index),
-        expand(rules.unreliable_coverage_regions.output, sm=data.index),
+        expand(rules.genome_bedgraph.output, sm=MANIFEST.index),
+        expand(rules.coverage.output, sm=MANIFEST.index),
+        expand(rules.exclude_from_shuffle.output, sm=MANIFEST.index),
+        expand(rules.unreliable_coverage_regions.output, sm=MANIFEST.index),
         # model results
-        expand(rules.fire_sites.output, sm=data.index),
+        expand(rules.fire_sites.output, sm=MANIFEST.index),
         # fiber locations
-        expand(rules.fiber_locations.output, sm=data.index),
-        expand(rules.filtered_and_shuffled_fiber_locations.output, sm=data.index),
+        expand(rules.fiber_locations.output, sm=MANIFEST.index),
+        expand(rules.filtered_and_shuffled_fiber_locations.output, sm=MANIFEST.index),
         # coverage of elements
         expand(
             rules.element_coverages.output,
-            sm=data.index,
+            sm=MANIFEST.index,
             hp=not_unk,
             el_type=el_types,
         ),
         # FDR results
-        expand(rules.fdr_track.output, sm=data.index),
-        expand(rules.fdr_peaks_by_fire_elements.output, sm=data.index),
-        expand(rules.wide_fdr_peaks.output, sm=data.index),
-        expand(rules.peaks_vs_percent.output, sm=data.index),
-        expand(rules.one_percent_fdr_peaks.output, sm=data.index),
+        expand(rules.fdr_track.output, sm=MANIFEST.index),
+        expand(rules.fdr_peaks_by_fire_elements.output, sm=MANIFEST.index),
+        expand(rules.wide_fdr_peaks.output, sm=MANIFEST.index),
+        expand(rules.peaks_vs_percent.output, sm=MANIFEST.index),
+        expand(rules.one_percent_fdr_peaks.output, sm=MANIFEST.index),
+        expand(rules.fires_in_peaks.output.txt, sm=MANIFEST.index),
         # trackhub
-        expand(rules.fdr_track_to_bw.output.bw, sm=data.index, col=bw_types),
-        expand(rules.fdr_peaks_by_fire_elements_to_bb.output.bb, sm=data.index),
-        expand(rules.percent_accessible.output.bw, hp=not_unk, sm=data.index),
+        expand(rules.fdr_track_to_bw.output.bw, sm=MANIFEST.index, col=bw_types),
+        expand(rules.fdr_peaks_by_fire_elements_to_bb.output.bb, sm=MANIFEST.index),
+        expand(rules.percent_accessible.output.bw, hp=not_unk, sm=MANIFEST.index),
         expand(
             rules.element_coverages_bw.output.bw,
-            sm=data.index,
+            sm=MANIFEST.index,
             hp=not_unk,
             el_type=el_types,
         ),
-        expand(rules.trackhub.output, sm=data.index),
+        expand(rules.trackhub.output, sm=MANIFEST.index),
diff --git a/workflow/envs/env.yaml b/workflow/envs/env.yaml
index 4ac1139dfe..7b5b60d441 100644
--- a/workflow/envs/env.yaml
+++ b/workflow/envs/env.yaml
@@ -7,7 +7,8 @@ dependencies:
   - samtools==1.19.1
   - htslib==1.19.1
   - bedtools==2.31
-  - fibertools-rs>=0.3.9
+  - bioconda::fibertools-rs==0.5.3
+  - bioconda::gia
   - seqtk
   - hck>=0.9.2
   - bioawk
@@ -17,5 +18,4 @@ dependencies:
   - parallel
   - mosdepth==0.3.7
   - bedops
-  #- ucsc-bedgraphtobigwig
-  #- ucsc-bedtobigbed
+  - bioconda::bigtools==0.5.1
diff --git a/workflow/envs/python.yaml b/workflow/envs/python.yaml
index e686786892..8d338cbca7 100644
--- a/workflow/envs/python.yaml
+++ b/workflow/envs/python.yaml
@@ -8,7 +8,7 @@ dependencies:
   - tqdm
   - defopt
   - numpy==1.24 # use this version to avoid cxx errors
-  - numba::numba>=0.56
+  - numba::numba==0.60
   - pandas==1.4 # pandas versions later than this have cxx errors
   - pysam==0.21
   - htslib==1.19.1
diff --git a/workflow/envs/runner.yaml b/workflow/envs/runner.yaml
new file mode 100644
index 0000000000..211d7e503d
--- /dev/null
+++ b/workflow/envs/runner.yaml
@@ -0,0 +1,13 @@
+name: runner
+channels:
+  - numba
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - tqdm
+  - numpy
+  - pandas
+  - pip
+  - pip:
+      - pysam
diff --git a/workflow/rules/FDR-peaks.smk b/workflow/rules/FDR-peaks.smk
index 09d652f1ac..bbe1fa331f 100644
--- a/workflow/rules/FDR-peaks.smk
+++ b/workflow/rules/FDR-peaks.smk
@@ -2,16 +2,16 @@ rule filtered_and_shuffled_fiber_locations_chromosome:
     input:
         filtered=rules.fiber_locations.output.filtered,
         exclude=rules.exclude_from_shuffle.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         shuffled=temp("temp/{sm}/coverage/{chrom}.fiber-locations-shuffled.bed.gz"),
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
        tabix -h {input.filtered} {wildcards.chrom} \
-        | bedtools shuffle -chrom \
+        | bedtools shuffle -chrom -seed 42 \
            -excl {input.exclude} \
            -i - \
            -g {input.fai} \
@@ -32,7 +32,7 @@ rule filtered_and_shuffled_fiber_locations:
         shuffled="results/{sm}/coverage/filtered-for-coverage/fiber-locations-shuffled.bed.gz",
     threads: 1
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         cat {input.shuffled} > {output.shuffled}
@@ -47,7 +47,7 @@ rule fdr_table:
         fire=rules.fire_sites.output.bed,
         fiber_locations=rules.fiber_locations.output.filtered,
         shuffled=rules.filtered_and_shuffled_fiber_locations.output.shuffled,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         tbl="results/{sm}/FDR-peaks/FIRE.score.to.FDR.tbl",
     benchmark:
@@ -70,7 +70,7 @@ rule fdr_track_chromosome:
         fire=rules.fire_sites.output.bed,
         fire_tbi=rules.fire_sites_index.output.tbi,
         fiber_locations=rules.fiber_locations.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
         fdr_tbl=rules.fdr_table.output.tbl,
     output:
         fire=temp("temp/{sm}/FDR-peaks/{chrom}-fire.bed"),
@@ -115,7 +115,7 @@ rule fdr_track:
         tbi="results/{sm}/FDR-peaks/FDR.track.bed.gz.tbi",
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         printf '\nMaking FOFN\n'
@@ -147,7 +147,7 @@ rule fdr_track_filtered:
         tbi="results/{sm}/FDR-peaks/FDR.track.coverage.filtered.bed.gz.tbi",
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         MIN=$(cat {input.minimum})
@@ -173,10 +173,10 @@ rule helper_fdr_peaks_by_fire_elements:
         bed=temp("temp/{sm}/FDR-peaks/{chrom}-FDR-FIRE-peaks.bed.gz"),
     threads: 2
     conda:
-        default_env
+        DEFAULT_ENV
     params:
-        max_peak_fdr=max_peak_fdr,
-        min_per_acc_peak=min_per_acc_peak,
+        max_peak_fdr=MAX_PEAK_FDR,
+        min_per_acc_peak=MIN_PER_ACC_PEAK,
     shell:
         """
         HEADER=$(zcat {input.bed} | head -n 1 || true)
@@ -226,7 +226,7 @@ rule fdr_peaks_by_fire_elements_chromosome:
         "../envs/python.yaml"
     params:
         script=workflow.source_path("../scripts/merge_fire_peaks.py"),
-        min_frac_accessible=config.get("min_frac_accessible", 0),
+        min_frac_accessible=MIN_FRAC_ACCESSIBLE,
     shell:
         """
         zcat {input.bed} \
@@ -252,7 +252,7 @@ rule fdr_peaks_by_fire_elements:
         tbi="results/{sm}/FDR-peaks/FDR-FIRE-peaks.bed.gz.tbi",
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         printf "\nMaking FOFN\n"
@@ -275,24 +275,21 @@ rule wide_fdr_peaks:
     input:
         bed=rules.fdr_peaks_by_fire_elements.output.bed,
         track=rules.fdr_track.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         bed="results/{sm}/FDR-peaks/FDR-wide-peaks.bed.gz",
         tbi="results/{sm}/FDR-peaks/FDR-wide-peaks.bed.gz.tbi",
         bb="results/{sm}/trackHub/bb/FDR-wide-peaks.bb",
     conda:
-        default_env
+        DEFAULT_ENV
     threads: 4
     params:
         nuc_size=config.get("nucleosome_size", 147),
-        max_peak_fdr=max_peak_fdr,
-        min_frac_acc=max(config.get("min_frac_accessible", 0), min_per_acc_peak),
+        max_peak_fdr=MAX_PEAK_FDR,
+        min_frac_acc=max(MIN_FRAC_ACCESSIBLE, MIN_PER_ACC_PEAK),
+        bed3_as=workflow.source_path("../templates/bed3.as"),
     shell:
         """
-        FILE={output.bed}
-        TMP="${{FILE%.*}}"
-        echo $TMP
-
         ( \
             zcat {input.bed}; \
             bioawk -tc hdr 'NR==1 || $FDR<={params.max_peak_fdr}' {input.track} \
@@ -302,9 +299,14 @@ rule wide_fdr_peaks:
             | cut -f 1-3 \
             | bedtools sort \
             | bedtools merge -d {params.nuc_size} \
-            > $TMP
-        bedToBigBed $TMP {input.fai} {output.bb}
-        bgzip -f -@ {threads} $TMP
+            | bgzip -@ {threads} \
+            > {output.bed}
+
+        bgzip -cd -@ 16 {output.bed} \
+            | bigtools bedtobigbed \
+            -s start -a {params.bed3_as} \
+            - {input.fai} {output.bb}
+
         tabix -p bed {output.bed}
         """
@@ -320,7 +322,7 @@ rule one_percent_fdr_peaks:
         wide_tbi="results/{sm}/FDR-peaks/one-percent-FDR/FDR-01-FIRE-wide-peaks.bed.gz.tbi",
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         nuc_size=config.get("nucleosome_size", 147),
     shell:
diff --git a/workflow/rules/apply-model.smk b/workflow/rules/apply-model.smk
index bbe3bdf9a0..bd1e82a027 100644
--- a/workflow/rules/apply-model.smk
+++ b/workflow/rules/apply-model.smk
@@ -3,7 +3,7 @@
 #
 rule fire:
     input:
-        bam=ancient(lambda wc: data.loc[wc.sm, "bam"]),
+        bam=ancient(get_input_bam),
     output:
         bam=temp("temp/{sm}/fire/{chrom}.fire.bam"),
     threads: 8
@@ -12,12 +12,14 @@ rule fire:
     params:
         min_msp=config.get("min_msp", 10),
         min_ave_msp_size=config.get("min_ave_msp_size", 10),
+        use_ont=USE_ONT,
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         samtools view -u -@ {threads} {input.bam} {wildcards.chrom} \
-        | ft fire -t {threads} \
+        | {FT_EXE} fire -t {threads} \
+            {params.use_ont} \
            --min-msp {params.min_msp} \
            --min-ave-msp-size {params.min_ave_msp_size} \
            --skip-no-m6a \
@@ -27,21 +29,29 @@
 rule merged_fire_bam:
     input:
+        ref=ancient(REF),
+        fai=ancient(FAI),
         bams=expand(rules.fire.output.bam, chrom=get_chroms(), allow_missing=True),
     output:
-        bam="results/{sm}/fire/{sm}.fire.bam",
-        bai="results/{sm}/fire/{sm}.fire.bam.bai",
+        cram="results/{sm}/fire/{sm}.fire.cram",
+        crai="results/{sm}/fire/{sm}.fire.cram.crai",
     threads: 16
     resources:
-        mem_mb=8 * 1024,
+        mem_mb=16 * 1024,
+        runtime=300,
     conda:
-        default_env
+        DEFAULT_ENV
     benchmark:
         "results/{sm}/benchmarks/merged_fire_bam/{sm}.txt"
     shell:
         """
-        samtools merge -@ {threads} -o {output.bam} {input.bams}
-        samtools index -@ {threads} {output.bam}
+        samtools merge -@ {threads} -u {input.bams} -o - \
+        | samtools view \
+            -C -@ {threads} \
+            -T {input.ref} \
+            --output-fmt-option embed_ref=1 \
+            --write-index \
+            -o {output.cram}
         """
@@ -52,13 +62,13 @@ rule extract_from_fire:
         bed=temp("temp/{sm}/chrom/{chrom}.sorted.bed.gz"),
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     resources:
         mem_mb=16 * 1024,
     priority: 10
     shell:
         """
-        ft fire -t {threads} --extract {input.bam} \
+        {FT_EXE} fire -t {threads} --all --extract {input.bam} \
         | LC_ALL=C sort \
            --parallel={threads} \
            -k1,1 -k2,2n -k3,3n -k4,4 \
@@ -76,7 +86,7 @@ rule merge_model_results:
         bed=temp("temp/{sm}/fiber-calls/model.results.bed.gz"),
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         n_chunks=len(get_chroms()) + 1,
     benchmark:
@@ -94,7 +104,7 @@ rule index_model_results:
     output:
         tbi=temp(rules.merge_model_results.output.bed + ".tbi"),
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         tabix -p bed {input.bed}
@@ -109,9 +119,9 @@ rule fire_sites_chrom:
         bed=temp("temp/{sm}/fiber-calls/{chrom}/FIRE.bed.gz"),
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     params:
-        min_fdr=min_fire_fdr,
+        min_fdr=MIN_FIRE_FDR,
     shell:
         """
         tabix {input.bed} {wildcards.chrom} \
@@ -132,9 +142,9 @@ rule fire_sites:
         bed="results/{sm}/fiber-calls/FIRE.bed.gz",
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     params:
-        min_fdr=min_fire_fdr,
+        min_fdr=MIN_FIRE_FDR,
     shell:
         """
         cat {input.beds} > {output.bed}
@@ -148,7 +158,7 @@ rule fire_sites_index:
         tbi=rules.fire_sites.output.bed + ".tbi",
     threads: 1
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         tabix -p bed {input.bed}
@@ -159,13 +169,13 @@ rule split_by_hap_per_chrom:
     input:
         bed=rules.merge_model_results.output.bed,
         tbi=rules.index_model_results.output.tbi,
-        fai=f"{ref}.fai",
+        fai=ancient(FAI),
     output:
         both=pipe("temp/{sm}/coverage/all/{chrom}.bed"),
         H1=pipe("temp/{sm}/coverage/hap1/{chrom}.bed"),
         H2=pipe("temp/{sm}/coverage/hap2/{chrom}.bed"),
     conda:
-        default_env
+        DEFAULT_ENV
     resources:
         disk_mb=100,
         runtime=240,
@@ -182,16 +192,16 @@
 rule split_hap_by_element_type_per_chrom:
     input:
         bed="temp/{sm}/coverage/{hp}/{chrom}.bed",
-        fai=f"{ref}.fai",
+        fai=ancient(FAI),
     output:
         fire=temp("temp/{sm}/coverage/{hp}/fire_{chrom}.bed.gz"),
         link=temp("temp/{sm}/coverage/{hp}/linker_{chrom}.bed.gz"),
         nuc=temp("temp/{sm}/coverage/{hp}/nucleosome_{chrom}.bed.gz"),
     params:
-        min_fire_fdr=min_fire_fdr,
+        min_fire_fdr=MIN_FIRE_FDR,
     threads: 2
     conda:
-        default_env
+        DEFAULT_ENV
     resources:
         disk_mb=100,
         mem_mb=8 * 1024,
@@ -209,6 +219,15 @@ rule split_hap_by_element_type_per_chrom:
            | awk '$10>1.0' \
            | bedtools genomecov -bg -i - -g {input.fai} \
            | bgzip > {output.nuc}
+
+        # check if files are empty and if they are add a fake data line
+        for f in {output.fire} {output.link} {output.nuc}; do
+            HAS_LINES=$(zcat $f | head | grep -cv '^#') || true
+            if [ $HAS_LINES -eq 0 ]; then
+                printf "{wildcards.chrom}\\t0\\t1\\t0\\n" \
+                    | bgzip -@{threads} > $f
+            fi
+        done
         """
@@ -222,7 +241,7 @@ rule element_coverages_per_chrom:
     output:
         bed=temp("temp/{sm}/coverage/{hp}_{chrom}_element_coverages.bed.gz"),
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         names="\t".join(el_types),
     resources:
@@ -237,8 +256,12 @@ rule element_coverages_per_chrom:
            | bgzip -@{threads} \
            > {output.bed}
        else
-            bedtools unionbedg -header -i {input.beds} -names {params.names} \
-            | sed 's/^chrom/#chrom/' \
+            # bedtools unionbedg -header -i {input.beds} -names {params.names}
+            # | sed 's/^chrom/#chrom/'
+            ( \
+                printf "#chrom\\tstart\\tend\\t{params.names}\\n"; \
+                gia unionbedg -s -i {input.beds} \
+            ) \
            | bgzip -@ {threads} \
            > {output.bed}
        fi
@@ -256,7 +279,7 @@ rule element_coverages:
         bed="results/{sm}/coverage/{hp}_element_coverages.bed.gz",
         tbi="results/{sm}/coverage/{hp}_element_coverages.bed.gz.tbi",
     conda:
-        default_env
+        DEFAULT_ENV
     threads: 1
     shell:
         """
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 1162115b3e..bcf267d4ba 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -5,20 +5,53 @@ import sys
 
 FIRST_REPORT = True
 
 
+def get_ref():
+    if "ref" not in config:
+        raise ValueError("FIRE: ref parameter is missing in config.yaml")
+    ref = config["ref"]
+    if not os.path.exists(ref):
+        raise ValueError(f"FIRE: reference file {ref} does not exist")
+    return ref
+
+
+def get_fai():
+    fai = f"{get_ref()}.fai"
+    if not os.path.exists(fai):
+        raise ValueError(f"FIRE: reference index file {fai} does not exist")
+    return fai
+
+
+def get_excludes():
+    excludes = config.get("excludes", [])
+    if REF_NAME == "hg38" or REF_NAME == "GRCh38":
+        files = [
+            "../annotations/hg38.blacklist.ENCFF356LFX.bed.gz",
+            "../annotations/hg38.gap.bed.gz",
+            "../annotations/SDs.merged.hg38.bed.gz",
+        ]
+        excludes += [workflow.source_path(file) for file in files]
+    return excludes
+
+
+def get_fai_df():
+    fai = get_fai()
+    return pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"])
+
+
 def get_chroms():
     global FIRST_REPORT
     min_contig_length = config.get("min_contig_length", 0)
-    skipped_contigs = fai["chr"][fai["length"] < min_contig_length]
+    skipped_contigs = FAI_DF["chr"][FAI_DF["length"] < min_contig_length]
     if len(skipped_contigs) > 0 and FIRST_REPORT:
         print(
             f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
             file=sys.stderr,
         )
-    chroms = fai["chr"][fai["length"] >= min_contig_length]
+    chroms = FAI_DF["chr"][FAI_DF["length"] >= min_contig_length]
     chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
     chroms = [chrom for chrom in chroms if "_random" not in chrom]
-    chroms = [chrom for chrom in chroms if re.fullmatch(keep_chrs, chrom)]
+    chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]
 
     if FIRST_REPORT:
         FIRST_REPORT = False
@@ -27,11 +60,27 @@ def get_chroms():
     if len(chroms) == 0:
         raise ValueError(
             f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
-            f"Your fai file contains the following chromosomes: {fai['chr']}"
+            f"Your fai file contains the following chromosomes: {FAI_DF['chr']}"
         )
     return chroms
 
 
+def get_manifest():
+    manifest = config.get("manifest")
+    if manifest is None:
+        raise ValueError("manifest parameter is missing in config.yaml")
+    if not os.path.exists(manifest):
+        raise ValueError(f"Manifest file {manifest} does not exist")
+    manifest = pd.read_csv(config["manifest"], sep=r"\s+", comment="#").set_index(
+        "sample"
+    )
+    return manifest
+
+
+def get_input_bam(wc):
+    return MANIFEST.loc[wc.sm, "bam"]
+
+
 def get_mem_mb(wildcards, attempt):
     if attempt < 3:
         return attempt * 1024 * 32
@@ -60,9 +109,9 @@ def grep_command_for_el_type(wc):
     if wc.el_type == "nucleosome":
         return "awk '$10>1.0'"
     elif wc.el_type == "linker":
-        return f"awk '$10<=1.0 && $10>{min_fire_fdr}'"
+        return f"awk '$10<=1.0 && $10>{MIN_FIRE_FDR}'"
     elif wc.el_type == "fire":
-        return f"awk '$10<={min_fire_fdr}'"
+        return f"awk '$10<={MIN_FIRE_FDR}'"
     else:
         raise ValueError(f"Unknown element type {wc.el_type}")
 
diff --git a/workflow/rules/coverages.smk b/workflow/rules/coverages.smk
index 0594c45000..990416c9af 100644
--- a/workflow/rules/coverages.smk
+++ b/workflow/rules/coverages.smk
@@ -3,8 +3,10 @@
 #
 rule genome_bedgraph:
     input:
-        bam=rules.merged_fire_bam.output.bam,
-        bai=rules.merged_fire_bam.output.bai,
+        ref=ancient(REF),
+        fai=ancient(FAI),
+        cram=rules.merged_fire_bam.output.cram,
+        crai=rules.merged_fire_bam.output.crai,
     output:
         bg="results/{sm}/coverage/{sm}.bed.gz",
         tbi="results/{sm}/coverage/{sm}.bed.gz.tbi",
@@ -12,12 +14,12 @@ rule genome_bedgraph:
     shadow:
         "minimal"
     conda:
-        default_env
+        DEFAULT_ENV
     benchmark:
         "results/{sm}/benchmarks/genome_bedgraph/{sm}.txt"
     shell:
         """
-        mosdepth -t {threads} tmp {input.bam}
+        mosdepth -f {input.ref} -t {threads} tmp {input.cram}
         zcat tmp.per-base.bed.gz \
            | LC_ALL=C sort --parallel={threads} -k1,1 -k2,2n -k3,3n -k4,4 \
            | bgzip -@ {threads} \
@@ -37,12 +39,12 @@ rule coverage:
         "../envs/python.yaml"
     threads: 1
     resources:
-        mem_mb=48 * 1024,
+        mem_mb=64 * 1024,
     benchmark:
         "results/{sm}/benchmarks/coverage/{sm}.txt"
     params:
-        coverage_within_n_sd=coverage_within_n_sd,
-        min_coverage=min_coverage,
+        coverage_within_n_sd=COVERAGE_WITHIN_N_SD,
+        min_coverage=MIN_COVERAGE,
         chroms=get_chroms(),
     script:
         "../scripts/cov.py"
@@ -53,17 +55,18 @@
 #
 rule fiber_locations_chromosome:
     input:
-        bam=rules.merged_fire_bam.output.bam,
+        cram=rules.merged_fire_bam.output.cram,
+        crai=rules.merged_fire_bam.output.crai,
     output:
         bed=temp("temp/{sm}/coverage/{chrom}.fiber-locations.bed.gz"),
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
        # get fiber locations
-        (samtools view -@ {threads} -F 2308 -u {input.bam} {wildcards.chrom} \
-        | ft extract -t {threads} -s --all - \
+        (samtools view -@ {threads} -F 2308 -u {input.cram} {wildcards.chrom} \
+        | {FT_EXE} extract -t {threads} -s --all - \
        | hck -F '#ct' -F st -F en -F fiber -F strand -F HP ) \
        | (grep -v "^#" || true) \
        | bgzip -@ {threads} \
@@ -88,7 +91,7 @@ rule fiber_locations:
         filtered_tbi="results/{sm}/coverage/filtered-for-coverage/fiber-locations.bed.gz.tbi",
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         cat {input.fibers} > {output.bed}
@@ -112,14 +115,14 @@ rule exclude_from_shuffle:
     input:
         filtered=rules.fiber_locations.output.filtered,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         bed="results/{sm}/coverage/exclude-from-shuffles.bed.gz",
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     params:
-        exclude=excludes,
+        exclude=EXCLUDES,
     shell:
         """
@@ -140,28 +143,35 @@ rule unreliable_coverage_regions:
     input:
         bg=rules.genome_bedgraph.output.bg,
         minimum=rules.coverage.output.minimum,
         maximum=rules.coverage.output.maximum,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         bed="results/{sm}/coverage/unreliable-coverage-regions.bed.gz",
         bed_tbi="results/{sm}/coverage/unreliable-coverage-regions.bed.gz.tbi",
         bb="results/{sm}/trackHub/bb/unreliable-coverage-regions.bb",
     threads: 4
+    params:
+        min_len=MIN_UNRELIABLE_COVERAGE_LEN,
+        bed3_as=workflow.source_path("../templates/bed3.as"),
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
-        FILE={output.bed}
-        TMP="${{FILE%.*}}"
-
         MIN=$(cat {input.minimum})
         MAX=$(cat {input.maximum})
         zcat {input.bg} \
            | awk '$4>0' \
            | awk -v MAX="$MAX" -v MIN="$MIN" '$4 <= MIN || $4 >= MAX' \
            | bedtools merge -i - \
-            > $TMP
-        bedToBigBed $TMP {input.fai} {output.bb}
-        # compress
-        bgzip -f -@ {threads} $TMP
+            | awk '$3-$2 >= {params.min_len}' \
+            | bgzip -@ {threads} \
+            > {output.bed}
+
+        # bigbed
+        bgzip -cd {output.bed} -@ {threads} \
+            | bigtools bedtobigbed \
+            -s start -a {params.bed3_as} \
+            - {input.fai} {output.bb}
+
+        # index
         tabix -f -p bed {output.bed}
         """
diff --git a/workflow/rules/decorated-reads.smk b/workflow/rules/decorated-reads.smk
index c6fea82b52..d05a9fceea 100644
--- a/workflow/rules/decorated-reads.smk
+++ b/workflow/rules/decorated-reads.smk
@@ -1,6 +1,7 @@
 rule decorate_fibers_chromosome:
     input:
-        bam=rules.merged_fire_bam.output.bam,
+        cram=rules.merged_fire_bam.output.cram,
+        crai=rules.merged_fire_bam.output.crai,
     output:
         bed=temp("temp/{sm}/decorate/{chrom}.bed.gz"),
         decorated=temp("temp/{sm}/decorate/{chrom}.dec.bed.gz"),
@@ -8,11 +9,11 @@ rule decorate_fibers_chromosome:
     resources:
         mem_mb=get_large_mem_mb,
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
-        samtools view -@ {threads} -u {input.bam} {wildcards.chrom} \
-        | ft track-decorators -t {threads} --bed12 {output.bed} \
+        samtools view -@ {threads} -u {input.cram} {wildcards.chrom} \
+        | {FT_EXE} track-decorators -t {threads} --bed12 {output.bed} \
        | sort -k1,1 -k2,2n -k3,3n -k4,4 \
        | bgzip -@ {threads} \
        > {output.decorated}
@@ -26,7 +27,7 @@ rule decorate_fibers_1:
             chrom=get_chroms(),
             allow_missing=True,
         ),
-        fai=f"{ref}.fai",
+        fai=ancient(FAI),
     output:
         bed="results/{sm}/fiber-calls/fire-fibers.bed.gz",
         bb="results/{sm}/trackHub/bb/fire-fibers.bb",
@@ -36,14 +37,17 @@ rule decorate_fibers_1:
     resources:
         runtime=240,
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         bed_as=workflow.source_path("../templates/bed12_filter.as"),
     shell:
         """
         cat {input.bed} > {output.bed}
-        bedToBigBed -allow1bpOverlap -type=bed12+ -as={params.bed_as} \
-            {output.bed} {input.fai} {output.bb}
+
+        bgzip -cd -@ {threads} {output.bed} \
+            | bigtools bedtobigbed \
+            -s start -a {params.bed_as} \
+            - {input.fai} {output.bb}
         """
@@ -54,9 +58,8 @@ rule decorate_fibers_2:
             chrom=get_chroms(),
             allow_missing=True,
         ),
-        fai=f"{ref}.fai",
+        fai=ancient(FAI),
     output:
-        decorated=temp("temp/{sm}/fiber-calls/fire-fiber-decorators.bed.gz"),
         bb="results/{sm}/trackHub/bb/fire-fiber-decorators.bb",
     benchmark:
         "results/{sm}/benchmarks/decorate_fibers_2/{sm}.txt"
@@ -64,12 +67,15 @@ rule decorate_fibers_2:
     resources:
         runtime=60 * 16,
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         dec_as=workflow.source_path("../templates/decoration.as"),
     shell:
         """
-        cat {input.decorated} > {output.decorated}
-        bedToBigBed -allow1bpOverlap -type=bed12+ -as={params.dec_as} \
-            {output.decorated} {input.fai} {output.bb}
+        cat {input.decorated} \
+            | bgzip -cd -@ {threads} \
+            | rg -v '^#' \
+            | bigtools bedtobigbed \
+            -a {params.dec_as} -s start \
+            - {input.fai} {output.bb}
         """
diff --git a/workflow/rules/peak-stats.smk b/workflow/rules/peak-stats.smk
index dbca987abe..2348e42373 100644
--- a/workflow/rules/peak-stats.smk
+++ b/workflow/rules/peak-stats.smk
@@ -4,14 +4,14 @@
 rule clustering_vs_null:
     input:
         bed=rules.fire_sites.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         tmp=temp("temp/{sm}/tmp.pre.calls.bed"),
         null=temp("temp/{sm}/null.calls.bed"),
         bed="results/{sm}/clustering-vs-null.bed.gz",
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
         bgzip -cd -@{threads} {input.bed} | cut -f 1-3 > {output.tmp}
@@ -25,20 +25,28 @@ rule clustering_vs_null:
         """
 
 
-rule percent_in_clusters:
+rule fires_in_peaks:
     input:
-        bed=rules.clustering_vs_null.output.bed,
-        fire=rules.fdr_track_filtered.output.bed,
+        fire=rules.fire_sites.output.bed,
+        exclude=rules.unreliable_coverage_regions.output.bed,
+        peaks=rules.fdr_peaks_by_fire_elements.output.bed,
     output:
-        txt="results/{sm}/percent-in-clusters.txt",
+        tmp=temp("temp/{sm}/tmp.FIREs-in-peaks.bed"),
+        txt="results/{sm}/tables/FIREs-in-peaks.txt",
     threads: 8
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         script=workflow.source_path("../scripts/percent-in-clusters.sh"),
     shell:
         """
-        bash {params.script} {input.bed} {input.fire} {output.txt}
+        bedtools intersect -sorted -a {input.fire} -b {input.exclude} -v > {output.tmp}
+
+        echo "Total # of FIREs within normal coverage regions" >> {output.txt}
+        wc -l {output.tmp} >> {output.txt}
+
+        echo "# of FIREs within peaks" >> {output.txt}
+        bedtools intersect -sorted -u -a {input.fire} -b {input.peaks} | wc -l >> {output.txt}
         """
diff --git a/workflow/rules/track-hub.smk b/workflow/rules/track-hub.smk
index 8fb8ce7323..4a2eb5f868 100644
--- a/workflow/rules/track-hub.smk
+++ b/workflow/rules/track-hub.smk
@@ -1,7 +1,7 @@
 rule percent_accessible:
     input:
         bed=rules.fdr_track.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         tmp=temp("temp/{sm}/{hp}/percent.accessible.bed"),
         bw="results/{sm}/trackHub/bw/{hp}.percent.accessible.bw",
@@ -9,7 +9,7 @@ rule percent_accessible:
         tbi="results/{sm}/{hp}/percent.accessible.bed.gz.tbi",
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     resources:
         mem_mb=get_mem_mb,
     params:
@@ -24,7 +24,9 @@ rule percent_accessible:
 
         # skip if the file is empty
         if [[ -s {output.tmp} ]]; then
-            bedGraphToBigWig {output.tmp} {input.fai} {output.bw}
+            bigtools bedgraphtobigwig \
+                --nzooms 10 -s start \
+                {output.tmp} {input.fai} {output.bw}
         else
             touch {output.bw}
         fi
@@ -37,84 +39,93 @@ rule element_coverages_bw:
     input:
         bed=rules.element_coverages.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
-        tmp=temp("temp/{sm}/trackHub/bw/{hp}.{el_type}.coverage.bed"),
         bw="results/{sm}/trackHub/bw/{hp}.{el_type}.coverage.bw",
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
-        zcat {input.bed} | hck -f 1-3 -F {wildcards.el_type} > {output.tmp}
-        bedGraphToBigWig {output.tmp} {input.fai} {output.bw}
+        zcat {input.bed} \
+        | hck -f 1-3 -F {wildcards.el_type} \
+        | grep -v "^#" \
+        | bigtools bedgraphtobigwig \
+        -s start --nzooms 10 \
+        - {input.fai} {output.bw}
         """
 
 
 rule fdr_track_to_bw:
     input:
         bed=rules.fdr_track.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         bw="results/{sm}/trackHub/bw/{col}.bw",
-        tmp=temp("temp/{sm}/trackHub/bw/{col}.tmp.bed"),
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     shell:
         """
-        hck -z -f 1-3 -F {wildcards.col} {input.bed} > {output.tmp}
-        bedGraphToBigWig {output.tmp} {input.fai} {output.bw}
+        hck -z -f 1-3 -F {wildcards.col} {input.bed} \
+        | grep -v "^#" \
+        | bigtools bedgraphtobigwig \
+        -s start --nzooms 10 \
+        - {input.fai} {output.bw}
         """
 
 
 rule fdr_peaks_by_fire_elements_to_bb:
     input:
         bed=rules.fdr_peaks_by_fire_elements.output.bed,
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
     output:
         bb="results/{sm}/trackHub/bb/FDR-FIRE-peaks.bb",
-        tmp=temp("temp/{sm}/trackHub/bb/FDR-FIRE-peaks.bb.tmp"),
     threads: 4
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         bedfmt=workflow.source_path("../templates/fire_peak.as"),
     shell:
         """
         zcat {input.bed} \
-        | bioawk -tc hdr '{{print $1,$2,$3,"peak-"NR,int($score*10),".",$score,"-1",$log_FDR,int($start/2+$end/2)-$peak_start}}' \
-        > {output.tmp}
-        bedToBigBed \
-            -type=bed6+4 -as={params.bedfmt} \
-            {output.tmp} {input.fai} {output.bb}
+        | bioawk -tc hdr '{{print $1,$2,$3,"peak-"NR,int($score*10),".",$score,"-1",$log_FDR,int($start/2+$end/2)-$peak_start}}' \
+        | bioawk -tc hdr '$5<=1000' \
+        | rg -v '^#' \
+        | bigtools bedtobigbed \
+        -a {params.bedfmt} -s start \
+        - {input.fai} {output.bb}
         """
 
 
 rule hap_differences_track:
     input:
         bed9=rules.hap_differences.output.bed9,
-        fai=f"{ref}.fai",
+        fai=ancient(FAI),
     output:
-        bed=temp("temp/{sm}/hap_differences/temp.bed"),
         bb="results/{sm}/trackHub/bb/hap_differences.bb",
     threads: 1
     resources:
         mem_mb=get_mem_mb,
     conda:
-        default_env
+        DEFAULT_ENV
     params:
         chrom=get_chroms()[0],
+        bed9_as=workflow.source_path("../templates/bed9.as"),
     shell:
         """
-        printf "{params.chrom}\t0\t1\tfake\t100\t+\t0\t1\t230,230,230\\n" > {output.bed}
-        bedtools sort -i {input.bed9} >> {output.bed}
-        bedToBigBed {output.bed} {input.fai} {output.bb}
+        ( \
+            printf "{params.chrom}\t0\t1\tfake\t100\t+\t0\t1\t230,230,230\\n"; \
+            bedtools sort -i {input.bed9} \
+        ) \
+        | bigtools bedtobigbed \
+        -s start -a {params.bed9_as} \
+        - {input.fai} {output.bb}
         """
 
 
 rule trackhub:
     input:
-        fai=ancient(f"{ref}.fai"),
+        fai=ancient(FAI),
         fire=rules.fdr_peaks_by_fire_elements_to_bb.output.bb,
         cov=rules.coverage.output.cov,
         hap_diffs=rules.hap_differences_track.output.bb,
@@ -129,7 +140,7 @@ rule trackhub:
     conda:
         "../envs/python.yaml"
     params:
-        ref=ref_name,
+        ref=REF_NAME,
         script=workflow.source_path("../scripts/trackhub.py"),
     shell:
         """
diff --git a/workflow/scripts/cov.py b/workflow/scripts/cov.py
index 2ba89b97fb..6058fc1669 100644
--- a/workflow/scripts/cov.py
+++ b/workflow/scripts/cov.py
@@ -3,6 +3,7 @@
 import sys
 import os
 import math
+import polars as pl
 
 coverage_within_n_sd = snakemake.params.coverage_within_n_sd
 MIN_coverage = snakemake.params.min_coverage
@@ -32,15 +33,42 @@ def weighted_median(df, val, weight):
     return comparison.iloc[0]
 
 
-df = pd.read_csv(
-    snakemake.input.bg,
-    sep="\t",
-    header=None,
-    names=["chr", "start", "end", "coverage"],
-)
-df = df.loc[df["coverage"] > 0]
-df = df.loc[df["chr"].isin(snakemake.params.chroms)]
-df["weight"] = df["end"] - df["start"]
+def pandas_read():
+    df = pd.read_csv(
+        snakemake.input.bg,
+        sep="\t",
+        header=None,
+        names=["chr", "start", "end", "coverage"],
+    )
+    df = df.loc[df["coverage"] > 0]
+    df = df.loc[df["chr"].isin(snakemake.params.chroms)]
+    df["weight"] = df["end"] - df["start"]
+    return df
+
+
+def polars_read():
+    # Reading the CSV file using the lazy API
+    df = (
+        pl.read_csv(
+            snakemake.input.bg,
+            separator="\t",
+            has_header=False,
+            new_columns=["chr", "start", "end", "coverage"],
+            low_memory=True,
+        )
+        .lazy()
+        .filter(pl.col("coverage") > 0)
+        .filter(pl.col("chr").is_in(snakemake.params.chroms))
+        .drop("chr")
+        .with_columns((pl.col("end") - pl.col("start")).alias("weight"))
+        .drop(["start", "end"])
+        .collect()
+        .to_pandas()
+    )
+    return df
+
+
+df = polars_read()
 
 print(df, file=sys.stderr)
 coverage = weighted_median(df, "coverage", "weight")
diff --git a/workflow/scripts/fire-null-distribution.py b/workflow/scripts/fire-null-distribution.py
index 8f4087dfcb..db9f563c95 100755
--- a/workflow/scripts/fire-null-distribution.py
+++ b/workflow/scripts/fire-null-distribution.py
@@ -87,7 +87,7 @@ def fire_scores_per_chrom(
     min_allowed_q=0.01,
     min_coverage=4,
 ):
-    fire_scores = np.zeros(chrom_length, dtype=np.float64)
+    fire_scores = np.zeros(int(chrom_length), dtype=np.float64)
     multi = -50.0
     # a multi of -50 and a min_allowed_q of 0.01 gives a max score of 100
     max_add = multi * np.log10(min_allowed_q)
@@ -160,7 +160,7 @@ def get_coverage_from_array(starts, ends, coverage_array, stat="median"):
 
 @njit
 def make_coverage_array(starts, ends, chrom_length):
-    coverage_array = np.zeros(chrom_length, dtype=np.float64)
+    coverage_array = np.zeros(int(chrom_length), dtype=np.float64)
     for start, end in zip(starts, ends):
         coverage_array[start:end] += 1
     return coverage_array
diff --git a/workflow/scripts/peaks-vs-percent.R b/workflow/scripts/peaks-vs-percent.R
index e3d9aff689..1caa2e081b 100644
--- a/workflow/scripts/peaks-vs-percent.R
+++ b/workflow/scripts/peaks-vs-percent.R
@@ -86,18 +86,23 @@ pecdf=df %>%
 
 by_5_per = df %>%
   mutate(
-    group = floor(acc_percent * 100)
+    group = floor(20 * acc_percent ) / 20 * 100
   ) %>%
-  filter(group %% 5 == 0) %>%
+  #filter(group %% 5 == 0) %>%
   group_by(group) %>%
-  slice_max(order_by = count, n = 1)
+  filter(count == max(count)) %>%
+  slice_max(order_by = count, n = 1) %>%
+  select(group, acc_percent, count)
+
+print(by_5_per, nrow=25)
+
 
 p5hist=by_5_per %>%
-  ggplot(aes(x=acc_percent-2.5/100, y=count)) +
+  ggplot(aes(x=(group+2.5)/100, y=count)) +
   geom_bar(stat="identity")+
   geom_text_repel(
     aes(
-      x=acc_percent-2.5/100,
+      #x=acc_percent-2.5/100,
       label=paste(
         comma(count)
       ),
diff --git a/workflow/templates/bed3.as b/workflow/templates/bed3.as
new file mode 100644
index 0000000000..5119084b2f
--- /dev/null
+++ b/workflow/templates/bed3.as
@@ -0,0 +1,6 @@
+table bed3 "bed3"
+(
+string chrom; "Reference sequence chromosome or scaffold"
+uint chromStart; "Start position in chromosome"
+uint chromEnd; "End position in chromosome"
+)
\ No newline at end of file
diff --git a/workflow/templates/bed9.as b/workflow/templates/bed9.as
new file mode 100644
index 0000000000..d4563f3d2a
--- /dev/null
+++ b/workflow/templates/bed9.as
@@ -0,0 +1,13 @@
+table bed9
+"Bed9"
+(
+string chrom; "Chromosome (or contig, scaffold, etc.)"
+uint chromStart; "Start position in chromosome"
+uint chromEnd; "End position in chromosome"
+string name; "Name of item"
+uint score; "Score from 0-1000"
+char[1] strand; "+ or -"
+uint thickStart; "Start of where display should be thick (start codon)"
+uint thickEnd; "End of where display should be thick (stop codon)"
+uint color; "Primary RGB color for the decoration"
+)
\ No newline at end of file