-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
88 lines (73 loc) · 2.55 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
configfile: "config.yml"


def _release_path(relative):
    """Return `relative` placed under the configured release directory."""
    return release_dir + '/' + relative


# ---- Tools -----------------------------------------------------------------
# Script expected inside the `esm` checkout; it is the declared output of
# rule download_esm.
extract_esm_script = "esm/scripts/extract.py"
# Command prefix that runs a program in the `dimension_db` conda environment.
conda_run1 = "conda run -n dimension_db --live-stream"

# ---- Inputs outside the release directory -----------------------------------
# Gene Ontology file produced by rule download_go.
go_basic = "databases/go-basic.obo"

# ---- Paths no longer in use (kept for reference) ----------------------------
#quickgo_gaf = "databases/QuickGO-annotations-1697773507369-20231020.gaf"
#quickgo_expanded = "input/quickgo_expanded.tsv.gz"
#goa_parsed = 'databases/goa_parsed.tsv.gz'
#goa_parsed_expanded = 'databases/goa_parsed_expanded.tsv.gz'
#proteins_for_learning = "input/proteins.fasta"

# ---- Release directory layout ------------------------------------------------
# Every path below lives under the directory named by config['release_dir'].
release_dir = config['release_dir']
uniprot_fasta = _release_path("databases/uniprot_sprot.fasta.gz")
goa_parsed_mf = _release_path("databases/goa_parsed_expanded.mf.tsv.gz")
annotation_path = _release_path('annotation.tsv')
taxon_profile_path = _release_path('taxa_profile.tsv.gz')
labels_path = _release_path('go_labels.tsv')
ids_path = _release_path('ids.txt')

# ---- ESM feature files -------------------------------------------------------
# One `.npy` path per entry of config['esm_models_to_use']; each entry is
# converted to a string and appended to the common `esm2_t` prefix.
esm_features_prefix = _release_path('esm2_t')
esm_model_ids = list(map(str, config['esm_models_to_use']))
esm_features_paths = [
    esm_features_prefix + model_id + '.npy'
    for model_id in esm_model_ids
]
# Download the Gene Ontology files into `databases/`:
#   * gocheck_do_not_annotate.json (side file, not a declared output)
#   * go-basic.obo                 (the declared output, `go_basic`)
# Each download names its destination with `wget -O`, so re-running the rule
# overwrites the previous copy instead of leaving `*.1` duplicates, and the
# command no longer depends on a `cd` into the target directory.
rule download_go:
    output:
        go_basic
    shell:
        "mkdir -p databases"
        " && wget -O databases/gocheck_do_not_annotate.json"
        " https://current.geneontology.org/ontology/subsets/gocheck_do_not_annotate.json"
        " && wget -O {output} https://purl.obolibrary.org/obo/go/go-basic.obo"
# Obtain a fresh checkout of the facebookresearch/esm repository, which
# provides the script named by `extract_esm_script`.
# Any existing `esm` directory is removed first so the clone never fails on a
# non-empty target. The repository is cloned over HTTPS: the previous remote
# string was not a valid git URL, and HTTPS works without an SSH key.
rule download_esm:
    output:
        extract_esm_script
    shell:
        "rm -rf esm && git clone https://github.com/facebookresearch/esm.git"
# Create the release directory named by config['release_dir'].
# The output is wrapped in directory() because it is a directory rather than
# a file, and the rule now has an action that actually creates it (before, the
# rule declared an output but ran nothing).
# NOTE(review): no rule in the visible source lists `release_dir` as an input,
# so this rule only runs when requested explicitly as a target; requesting it
# together with rules that write files inside the directory may be rejected
# by Snakemake -- confirm the intended use.
rule create_release_dir:
    output:
        directory(release_dir)
    shell:
        "mkdir -p {output}"
# Download the Swiss-Prot FASTA archive from the UniProt FTP site and store it
# at `uniprot_fasta` inside the release directory.
# The mkdir targets the configured release directory; the earlier command
# passed the literal word "release_dir" to mkdir and therefore created a stray
# directory of that name in the working directory.
rule download_uniprot:
    output:
        uniprot_fasta
    shell:
        "mkdir -p " + release_dir + "/databases && wget -O "
        + uniprot_fasta
        + " https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/uniprot_sprot.fasta.gz"
# Run src/download_annotation.py inside the `dimension_db` conda environment
# (via `conda_run1`); the declared result is `goa_parsed_mf`.
# Declared inputs: the evidence-code list `evi_not_to_use.txt`, the GO file
# and the UniProt FASTA archive.
# The mkdir targets the configured release directory; the earlier command
# passed the literal word "release_dir" to mkdir and therefore created a stray
# directory of that name in the working directory.
rule download_goa:
    input:
        'evi_not_to_use.txt',
        go_basic,
        uniprot_fasta
    output:
        goa_parsed_mf
    shell:
        "mkdir -p " + release_dir + " && " + conda_run1 + " python src/download_annotation.py"
# Disabled rules: everything from here to the closing ''' is one triple-quoted
# string literal, i.e. an expression with no effect, so none of the rules
# inside it (annotated_protein_list, create_features, create_taxon_profiles)
# is active.
# They cannot be re-enabled as-is: `proteins_for_learning` is only present as
# a commented-out assignment, and `input_features_ids_path` and
# `features_taxon_profile_path` are not assigned anywhere in the visible part
# of this file.
'''rule annotated_protein_list:
input:
goa_parsed_mf,
uniprot_fasta
output:
proteins_for_learning,
annotation_path
shell:
"conda run --live-stream -n plm python src/create_train_protein_set.py"
rule create_features:
input:
proteins_for_learning,
extract_esm_script
output:
input_features_ids_path
shell:
"conda run --live-stream -n plm python src/calc_features.py "+proteins_for_learning+" input/features"
rule create_taxon_profiles:
input:
input_features_ids_path
output:
features_taxon_profile_path
shell:
"conda run --live-stream -n plm python src/calc_taxon_dist.py"'''