diff --git a/.test/config/config.yaml b/.test/config/config.yaml index f4a0a21..0dc0ce5 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -28,7 +28,7 @@ small_variants: DV.vcf.gz #panel: '/cluster/work/pausch/alex/eQTL_GWAS/pangenie_panel.vcfwave.vcf' fastq: '' -HiFi: +HiFi_samples: sample1: hifi_1.fq.gz sample2: hifi_2.fq.gz @@ -55,6 +55,12 @@ covariates: Testis: 'eQTL.covar' sQTL: Testis: 'sQTL.covar' +mol_QTLs: + eQTL: + Testis: 'eQTL.TPM' + sQTL: + Testis: 'sQTL.clusters' + permutations: 2500 window: 1000000 #1 Mb diff --git a/.test/eQTL.TPM b/.test/eQTL.TPM new file mode 100644 index 0000000..e69de29 diff --git a/.test/sQTL.clusters b/.test/sQTL.clusters new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index a221dc7..0b49244 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,30 @@ Structural variants are known to play a large role in expression and splicing QT However, confidently calling stuctural variants in a sufficiently large population for association mapping is hard. Here we use [PanGenie](https://github.com/eblerjana/pangenie) to genotype a larger cohort (100s) of short reads using an accurate pangenome panel from haplotype-resolved assemblies (10s). -## Usage +### Usage +There are broadly three phases + - Pangenome panel (creating & genotyping) + - Variant analysis (statistics, linkage disequibrium, SV overlap, etc.) + - Association mapping of e/sQTL + +Running with +``` +snakemake --configfile config/config.yaml +``` +Will produce execute the following DAG + +![workflow](https://github.com/AnimalGenomicsETH/pangenome_molQTL/assets/29678761/bb0c73ca-fc31-4319-95e2-485da93f655a) + +which produces the major output files (e.g., accuracy comparison of PanGenie vs DeepVariant, SV overlap with Jasmine, conditional QTL analysis with QTLtools, etc.), which can then be independently analysed further. ### Citation The preprint associated with this work can be found [here](https://www.biorxiv.org/content/10.1101/2023.06.21.545879v1). +> **Pangenome genotyped structural variation improves molecular phenotype mapping in cattle** +> +> Alexander S. Leonard, Xena M. Mapel, Hubert Pausch + +### Note +Many of the parameters are tuned to run for our data and on the ETH Euler cluster, using for example a forked version of the LSF snakemake [profile](https://github.com/AnimalGenomicsETH/snakemake_lsf), so it may take some modifying to work smoothly in different contexts. Many tools are assumed to be available in $PATH, but all are freely available. + diff --git a/Snakefile b/Snakefile index b5f5e05..ac3e0d4 100644 --- a/Snakefile +++ b/Snakefile @@ -2,14 +2,20 @@ include: 'snakepit/pangenie_panel.smk' include: 'snakepit/pangenie_genotyping.smk' include: 'snakepit/variant_calling.smk' +include: 'snakepit/variant_accuracy.smk' include: 'snakepit/association_mapping.smk' include: 'snakepit/LD.smk' rule all: input: + ## Pangenome panel creation and genotyping 'pangenie_panel/multisample-vcfs/graph-filtered.vcf', 'pangenie_panel/samples.all.pangenie_genotyping_DV.vcf.gz', + ## Variant comparison between pangenome panel, short reads, and HiFi reads 'jasmine.vcf', - 'SVs/sizes.gz', + expand('SVs/{metric}.gz',metric=('sizes','support','GTs')), + expand('SNPs/{metric}.csv',metric=('F1','isec')), + ## Linkage disequilibrium analysis of SVs expand('LD/samples.SV.{r2}.{window}.tags.list',r2=list(range(2,10))+[99],window=(10,100,1000)), + ## Molecular QTL mapping expand('QTL/{QTL}/Testis_{variants}/conditionals.01.txt.gz',QTL=('eQTL','sQTL'),variants=('PanGenie','SR')) diff --git a/region_coverage.sh b/region_coverage.sh deleted file mode 100644 index 1dae455..0000000 --- a/region_coverage.sh +++ /dev/null @@ -1,3 +0,0 @@ -samtools bedcov <(echo -e "14\t1559308\t1605491") $(awk 'NR>205 {print "/cluster/work/pausch/inputs/bam/BTA_eQTL/"$2".bam"}' ../config/large_cohort.yaml) -for i in $(awk 'NR>205 {print $2}' ../config/large_cohort.yaml); do echo "\"${i}\":$(awk '/total/ {print $1*150/2700000000.0}' /cluster/work/pausch/inputs/bam/BTA_eQTL/${i}.stats)"; done |tr '\n' ',' -bcftools query -f '{["%SAMPLE":"%GT",]\n' -r 14:1559308 variants.normed.vcf.gz diff --git a/snakepit/SNPs.smk b/snakepit/SNPs.smk deleted file mode 100644 index d13ac84..0000000 --- a/snakepit/SNPs.smk +++ /dev/null @@ -1,50 +0,0 @@ -rule all: - input: - expand('{asm}.ARS.vcf',asm=config['assemblies']) - -rule minimap2_align: - input: - '/cluster/work/pausch/alex/assembly/SV_graph/fasta/{asm}.fasta' - output: - '{asm}.ARS.paf' - threads: 8 - resources: - mem_mb = 5000, - walltime = '2:00' - shell: - 'minimap2 -cx asm5 -t {threads} --cs {config[reference]} {input} > {output}' - -rule paf_variants: - input: - '{asm}.ARS.paf' - output: - '{asm}.ARS.vcf' - resources: - mem_mb = 10000, - walltime = '30' - shell: - 'sort {input} -k6,6 -k8,8n | paftools.js call -f {config[reference]} - > {output}' - - -rule prep_vcf: - input: - '{asm}.ARS.vcf' - output: - multiext('{asm}.ARS.snp.vcf.gz','','.tbi') - shell: - ''' - bcftools reheader -s <(echo {wildcards.asm}) {input} | bcftools view -O z -o {output[0]} -v snps - - tabix -p vcf {output[0]} - ''' - -rule bcftools_merge: - input: - expand('{asm}.ARS.snp.vcf.gz',asm=config['assemblies']) - output: - multiext('merged.ARS.snp.vcf.gz','','.tbi') - threads: 2 - resources: - mem_mb = 2000, - walltime = '30' - shell: - 'bcftools merge --threads 2 -O z -o {output[0]} {input}; tabix -p vcf {output[0]}' diff --git a/snakepit/association_mapping.smk b/snakepit/association_mapping.smk index 085990e..661f647 100644 --- a/snakepit/association_mapping.smk +++ b/snakepit/association_mapping.smk @@ -7,19 +7,6 @@ wildcard_constraints: MAF = r'\d+', vcf = r'(eQTL|gwas)/\S+' -rule concat_genes: - input: - lambda wildcards: expand('/cluster/work/pausch/xena/eQTL/gene_counts/{tissue_code}/QTLtools/{chromosome}_gene_counts.gz',chromosome=range(1,30),tissue_code={'Testis':'testis','Epididymis_head':'epi_h','Vas_deferens':'vas_d'}[wildcards.tissue]) if wildcards.QTL=='eQTL' else expand('/cluster/work/pausch/xena/eQTL/sQTL/{tissue_code}/removed/phenotypes/leafcutter.clus.{chromosome}.bed.gz',chromosome=range(1,30),tissue_code={'Testis':'testis','Epididymis_head':'epi_h','Vas_deferens':'vas_d'}[wildcards.tissue]) - output: - 'aligned_genes/{QTL}.{tissue}.bed.gz', - 'aligned_genes/{QTL}.{tissue}.bed.gz.tbi' - localrule: True - shell: - ''' - zcat {input} | sort -k1,1n -k2,2n | uniq | bgzip -@ 2 -c > {output[0]} - tabix -p bed {output[0]} - ''' - rule normalise_vcf: input: lambda wildcards: expand(rules.merge_with_population_SR.output,pangenie_mode='genotyping',allow_missing=True) if wildcards.variants == 'PanGenie' else config['small_variants'] @@ -60,7 +47,7 @@ rule qtltools_parallel: input: vcf = rules.normalise_vcf.output, exclude = rules.exclude_MAF.output, - bed = rules.concat_genes.output, + bed = lambda wildcards: config['mol_QTLs'][wildcards.QTL][wildcards.tissue], cov = lambda wildcards: config['covariates'][wildcards.QTL][wildcards.tissue], mapping = lambda wildcards: 'QTL/{QTL}/{tissue}_{variants}/permutations_all.{MAF}.thresholds.txt' if wildcards._pass == 'conditionals' else [] output: @@ -101,7 +88,7 @@ rule qtltools_FDR: 'r/4.2.2' shell: ''' - Rscript /cluster/work/pausch/alex/software/qtltools/scripts/qtltools_runFDR_cis.R {input} 0.05 {params.out} + Rscript qtltools_runFDR_cis.R {input} 0.05 {params.out} ''' rule LD: diff --git a/snakepit/igv_validate.smk b/snakepit/igv_validate.smk deleted file mode 100644 index 6b6d0d4..0000000 --- a/snakepit/igv_validate.smk +++ /dev/null @@ -1,41 +0,0 @@ - -rule slice_ARS: - input: - bam = '/cluster/work/pausch/inputs/bam/BTA_UCD12/{sample}.bam', - asm = '/cluster/work/pausch/alex/assembly/BSWCHEM120151536851/hifiasmv152_100/hap2.scaffolds.fasta' - output: - bam = get_dir('igv','{sample}.TBX3.ARS.bam'), - asm = get_dir('igv','{sample}.TBX3X.BSW.bam') - threads: 4 - resources: - mem_mb = 5000, - disk_scratch = 10, - walltime = '1:00' - shell: - ''' - samtools view -@ {threads} -o {output.bam} {input.bam} 17:60000000-61000000 - samtools fastq -@ {threads} {output.bam} | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > $TMPDIR/reads.fq - minimap2 -t {threads} --cs -axsr {input.asm} <(seqtk seq -1 $TMPDIR/reads.fq) <(seqtk seq -2 $TMPDIR/reads.fq) | samtools sort - -m 1000M -@ {threads} -o {output.asm} - samtools index -@ {threads} {output.bam} - samtools index -@ {threads} {output.asm} - ''' - -rule slice_BSW: - input: - fastq = expand('/cluster/work/pausch/inputs/fastq/BTA/{{sample}}_R{N}.fastq.gz',N=(1,2)), - asm = '/cluster/work/pausch/alex/assembly/BSWCHEM120151536851/hifiasmv152_100/hap2.scaffolds.fasta' - output: - full = multiext(get_dir('igv','{sample}.BSW.bam'),'','.bai'), - region = multiext(get_dir('igv','{sample}.TBX3.BSW.bam'),'','.bai') - threads: 24 - resources: - mem_mb = 5000, - disk_scratch=100 - shell: - ''' - minimap2 -t {threads} --cs -axsr {input.asm} {input.fastq} | samtools sort - -m 1000M -@ {threads} -T $TMPDIR -o {output.full[0]} - samtools index -@ {threads} {output.full[0]} - samtools view -@ {threads} -o {output.region[0]} {output.full[0]} 17:68200000-68700000 - samtools index -@ {threads} {output.region[0]} - ''' - diff --git a/snakepit/variant_accuracy.smk b/snakepit/variant_accuracy.smk index eac1d9c..5bb23dc 100644 --- a/snakepit/variant_accuracy.smk +++ b/snakepit/variant_accuracy.smk @@ -1,85 +1,14 @@ from pathlib import PurePath ## Add reference path for happy container -workflow.singularity_args = f'-B $TMPDIR -B {PurePath(config["reference"]).parent}' - -rule all: - input: - #expand('SVs/{sample}.denovo.sniffles.vcf.gz',sample=config['samples']), - #'SVs/samples.denovo.sniffles.vcf.gz', - 'SNPs/F1.csv', - 'SNPs/isec.csv' - -rule sniffles_call: - input: - bam = lambda wildcards: config['SV_samples'][wildcards.sample]#'/nfs/nas12.ethz.ch/fs1201/green_groups_tg_public/data/long-read_alignments/PacBio_CCS/eQTL/{sample}.mm2.cram' - output: - vcf = temp(multiext('SVs/{sample}.denovo.sniffles.vcf.gz','','.tbi')), - snf = temp('SVs/{sample}.denovo.sniffles.snf') - threads: 4 - resources: - mem_mb = 2500 - conda: - 'sniffles' - shell: - ''' - sniffles --input {input.bam} --reference {config[reference]} --sample-id {wildcards.sample} --phase --minsvlen 50 --threads {threads} --vcf {output.vcf[0]} --snf {output.snf} - ''' - -rule sniffles_merge: - input: - snfs = expand(rules.sniffles_call.output['snf'],sample=config['SV_samples'],allow_missing=True) - output: - vcf = multiext('SVs/samples.{call}.sniffles.vcf.gz','','.tbi') - threads: 2 - resources: - mem_mb = 2500 - conda: - 'sniffles' - shell: - ''' - sniffles --input {input.snfs} --reference {config[reference]} --threads {threads} --vcf {output.vcf[0]} - ''' - -rule sniffles_genotype: - input: - bam = '/cluster/work/pausch/alex/CCS_eQTL_cohort/ARS/{sample}.mm2.cram', - vcf = lambda wildcards: '' if wildcards.call == 'denovo' else '/cluster/work/pausch/alex/eQTL_GWAS/SV_ACCURACY/genotyping.vcf.gz' #'/cluster/work/pausch/alex/eQTL_GWAS/SV_ACCURACY/pangenie.vcf' - output: - vcf = temp(multiext('SVs/{sample}.{call}.sniffles.vcf.gz','','.tbi')) - params: - threads: 4 - resources: - mem_mb = 1500 - conda: - 'sniffles' - shell: - ''' - sniffles --input {input.bam} --genotype-vcf {input.vcf} --reference {config[reference]} --sample-id {wildcards.sample} --minsvlen 50 --threads {threads} --vcf {output.vcf} - ''' - -rule bcftools_stuff: - input: - 'SVs/samples.denovo.sniffles.vcf.gz' - output: - sizes = 'SVs/sizes.gz', - support = 'SVs/support.gz', - GTs = 'SVs/GTs.gz' - localrule: True - shell: - ''' - bcftools query -e 'INFO/SVTYPE=="BND"' -f '%INFO/SVLEN\\n' {input} | pigz -p2 -c > {output.sizes} - bcftools query -e 'INFO/SVTYPE=="BND"' -f '%INFO/SUPP_VEC\\n' {input} | mawk ' {{ print gsub(1,2,$1) }} ' | pigz -p2 -c > {output.support} - bcftools query -e 'INFO/SVTYPE=="BND"' -f '[%GT ]\\n' {input} | sed 's"|" "g' | sed 's"/" "g' | sed 's/\./nan/g' | pigz -p2 -c > {output.GTs} - ''' - +workflow._singularity_args = f'-B $TMPDIR -B {PurePath(config["reference"]).parent}' def get_variants(variant,caller): input_dict = { - 'SNPs_PG':'/cluster/work/pausch/alex/eQTL_GWAS/pangenie_8_wave/samples.all.pangenie_genotyping.vcf.gz', - 'SNPs_DV':'/cluster/work/pausch/alex/eQTL_GWAS/variants/DV-SR/cohort.autosomes.WGS.imputed.vcf.gz', - 'SVs_PG': '/cluster/work/pausch/alex/eQTL_GWAS/pangenie_panel.vcfwave.vcf.gz', - 'SVs_Sniffles': 'SVs/samples.denovo.sniffles.vcf.gz' + 'SNPs_PG':'variant_calling/panel.small.vcf.gz', + 'SNPs_DV':config['small_variants'], + 'SVs_PG': 'variant_calling/panel.SV.vcf', + 'SVs_Sniffles': 'variant_calling/samples.denovo.sniffles.vcf.gz' } return input_dict[f'{variant}_{caller}'] @@ -130,7 +59,7 @@ rule happy: others = temp(multiext('SNPs/{sample}','.bcf','.bcf.csi','.extended.csv','.roc.all.csv.gz','.runinfo.json')) params: _dir = lambda wildcards, output: PurePath(output.csv).with_suffix('').with_suffix('') - container: '/cluster/work/pausch/alex/software/images/hap.py_latest.sif' + container: 'docker://pkrusche/hap.py' threads: 2 resources: mem_mb = 5000, @@ -175,8 +104,7 @@ rule jasmine: pigz -dc {input.vcfs[0]} > $TMPDIR/SVs/{wildcards.sample}.PG.vcf pigz -dc {input.vcfs[1]} > $TMPDIR/SVs/{wildcards.sample}.Sniffles.vcf - java -Xmx6048m -jar /cluster/work/pausch/alex/software/Jasmine/jasmine.jar \ - --comma_filelist file_list={params._input} threads={threads} out_file=/dev/stdout out_dir=$TMPDIR \ + jasmine --comma_filelist file_list={params._input} threads={threads} out_file=/dev/stdout out_dir=$TMPDIR \ genome_file={config[reference]} --pre_normalize --ignore_strand --allow_intrasample --ignore_type \ max_dist_linear=1 max_dist=1000 > {output} #|\ @@ -185,7 +113,7 @@ rule jasmine: rule gather_jasmine: input: - expand(rules.jasmine.output[0],sample=config['SV_samples']) + expand(rules.jasmine.output[0],sample=config['HiFi_samples']) output: 'SVs/jasmine.csv' localrule: True diff --git a/snakepit/variant_calling.smk b/snakepit/variant_calling.smk index e5d9d3f..5bdc34e 100644 --- a/snakepit/variant_calling.smk +++ b/snakepit/variant_calling.smk @@ -1,6 +1,6 @@ rule minimap2_align: input: - lambda wildcards: config['HiFi'][wildcards.sample] + lambda wildcards: config['HiFi_samples'][wildcards.sample] output: bam = temp('alignments/{sample}.HiFi.bam'), csi = temp('alignments/{sample}.HiFi.bam.csi') @@ -32,7 +32,7 @@ rule sniffles_call: rule sniffles_merge: input: - snfs = expand('variant_calling/{sample}.sniffles.snf',sample=config['HiFi']) + snfs = expand('variant_calling/{sample}.sniffles.snf',sample=config['HiFi_samples']) output: vcf = 'variant_calling/samples.sniffles.vcf.gz' threads: 2 @@ -119,23 +119,6 @@ rule jasmine_intersect: #jasmine --comma_filelist file_list=smoove_SV/All_filter_type.vcf,eQTL_GWAS/variants/variant_calling/panel.SV.vcf threads=1 out_file=SR_LR.vcf out_dir=$TMPDIR spec_reads=0 genome_file=REF_DATA/ARS-UCD1.2_Btau5.0.1Y.fa min_seq_id=0 --pre_normalize --ignore_strand --allow_intrasample max_dist_linear=1 --normalize_type --dup_to_ins #bcftools query -f '%CHROM\t%POS\t%INFO/END\t%INFO/SVTYPE\t0\t+\t%POS\t%INFO/END\t%INFO/SUPP_VEC\n' SR_LR.vcf | grep -vE "(INV|TRA)" | sed s'/10$/50,200,50/g' | sed s'/11$/250,100,150/g' | sed s'/01$/50,20,250/g' >> SR_LR.bed -rule bcftools_isec: - input: - read = 'DV_small.vcf.gz',#'DV-{read}/cohort.autosomes.WGS.vcf.gz', - asm = 'variant_calling/panel.small.vcf.gz', - other = 'panel.raw.small.vcf.gz' - output: - isec = 'isec_{read}_{strictness}.txt', - _count = 'isec_{read}_{strictness}.count' - threads: 2 - resources: - mem_mb = 3000 - shell: - ''' - bcftools isec --threads {threads} -n +1 -o {output.isec} -c {wildcards.strictness} {input} - cut -f 5 {output.isec} | sort | uniq -c > {output._count} - ''' - ## Assembly rules: rule minimap2_align_asm: input: