-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile
119 lines (110 loc) · 4.46 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# ref_snake
## Snakemake pipeline for genomic reference management
import re

import pandas as pd
########################################################################################################
# Config file
########################################################################################################
# Load the workflow configuration; the values are read below through the global `config` mapping.
configfile:"config/config.yml"
########################################################################################################
# Directories and locations
########################################################################################################
# Both locations are taken verbatim from the config file.
# NOTE(review): TMPDIR is not referenced again in this file; presumably the included rule files use it.
TMPDIR = config["TMPDIR"]
OUTDIR = config["OUTDIR"]
########################################################################################################
# Executables & params
########################################################################################################
# NOTE(review): EXEC is not referenced again in this file; presumably it holds the executables used by
# the included rule files — confirm against config/config.yml.
EXEC = config["EXEC"]
# The config value is split on whitespace, so it is expected to be a single string naming one or more
# species; the resulting list drives every expand() call in `rule all`.
SPECIES = config["SPECIES"].split()
########################################################################################################
# Wildcard constraints
########################################################################################################
# Global regular-expression constraints for the wildcards used in the workflow's file patterns.
#   SPECIES - lowercase letters and underscores only
#   BIOTYPE - ASCII letters only
#   OUTDIR  - exactly the configured output directory. Constraint values are regular expressions, so
#             the path is escaped to match literally even if it contains regex metacharacters
#             (".", "+", "(", ...).
# NOTE(review): BIOTYPE does not admit underscores, yet `rule all` requests BIOTYPE="genome_primary"
# for the STAR index; confirm that the STAR rules resolve that path (e.g. via a rule-level constraint).
wildcard_constraints:
    SPECIES = "[a-z_]+",
    BIOTYPE = "[a-zA-Z]+",
    OUTDIR = re.escape(config["OUTDIR"])
########################################################################################################
# Rules
########################################################################################################
# Each `include:` loads a rule file from the rules/ directory into this workflow.
# NOTE(review): the included files are not visible from here; the groupings below follow the file
# names and the original section comments only.
include: "rules/utils.smk"
## Pre-run set up
include: "rules/0_gget_species.smk"
include: "rules/0_download_raw.smk"
include: "rules/0_process_raw.smk"
include: "rules/0_subset_biotype.smk"
## Rules for each aligner
include: "rules/1_pseudo_cellranger.smk"
include: "rules/1_star.smk"
include: "rules/1_kallisto.smk"
# Disabled: the bowtie rule file is not loaded, and `rule all` requests no bowtie outputs.
# include: "rules/1_bowtie.smk"
include: "rules/1_bwa.smk"
include: "rules/1_minimap2.smk"
########################################################################################################
# Target files
########################################################################################################
# Aggregate target rule: it has inputs only, so requesting it asks for every file listed below,
# once per species in SPECIES.
rule all:
    input:
        # pseudo-cellranger for raw genome files
        expand(
            "{OUTDIR}/{SPECIES}/{BIOTYPE}/pseudo_cellranger/fasta/genome.fa",
            OUTDIR=config["OUTDIR"],
            BIOTYPE=["genome"],
            SPECIES=SPECIES,
        ),
        # kallisto-bustools reference(s), one index per workflow variant
        expand(
            "{OUTDIR}/{SPECIES}/{BIOTYPE}/{WORKFLOW}/transcriptome.idx",
            OUTDIR=config["OUTDIR"],
            WORKFLOW=[
                "kb",
                "kb_velo",
                "kb_nuc",
                "kb_primary",
                "kb_velo_primary",
                "kb_nuc_primary",
            ],
            BIOTYPE=["transcriptome"],
            SPECIES=SPECIES,
        ),
        # STAR reference
        expand(
            "{OUTDIR}/{SPECIES}/{BIOTYPE}/STAR/Genome",
            OUTDIR=config["OUTDIR"],
            BIOTYPE=["genome", "genome_primary", "rRNA"],
            SPECIES=SPECIES,
        ),
        # minimap2 index
        expand(
            "{OUTDIR}/{SPECIES}/{BIOTYPE}/minimap2/target.mmi",
            OUTDIR=config["OUTDIR"],
            BIOTYPE=["genome", "transcriptome"],
            SPECIES=SPECIES,
        ),
        # bwa-mem2 index: one target per index file suffix
        expand(
            "{OUTDIR}/{SPECIES}/{BIOTYPE}/bwa_mem2/ref.fa.gz{FILE}",
            OUTDIR=config["OUTDIR"],
            BIOTYPE=["genome", "transcriptome", "rRNA"],
            FILE=[".amb", ".ann", ".bwt.2bit.64", ".pac", ".0123"],
            SPECIES=SPECIES,
        ),
        # Raw data and metadata for each species
        expand(
            "{OUTDIR}/{SPECIES}/raw/{FILE}",
            OUTDIR=config["OUTDIR"],
            FILE=[
                "metadata.json",
                "genome.fa.gz",
                "genome.fa.fai",
                "transcriptome.fa.gz",
                "annotations.gtf.gz",
                "annotations.bed",
                "cds.fa.gz",
                "ncrna.fa.gz",
                "pep.fa.gz",
                "chrom_sizes.tsv",
                "gene_info.tsv",
                "transcript_info.tsv",
                # "rRNA.fa.gz"
            ],
            SPECIES=SPECIES,
        ),
        # Raw data and metadata for each biotype
        expand(
            "{OUTDIR}/{SPECIES}/{BIOTYPE}/raw/{FILE}",
            OUTDIR=config["OUTDIR"],
            FILE=["ref.fa.gz", "annotations.gtf.gz"],
            BIOTYPE=["rRNA"],  # TODO: miRNA, tRNA, etc.
            SPECIES=SPECIES,
        ),
        # list of species supported by gget
        "resources/gget_species.txt"