-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile
220 lines (176 loc) · 8.5 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
################################################################################
# (C) 2016 Tyler W. H. Backman
# Master makefile
################################################################################
# User options - set up the environment

# Remove a target whose recipe fails, so a half-written file is never
# mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

# Value passed to `hmmscan --cpu` and as the trailing argument of several
# R scripts below.
cores = 1
# Path of the working SQLite database used by the analysis rules.
databaseFile = working/bioassayDatabase.sqlite
# To keep the database on a Linux ramdisk, uncomment the next line instead.
# (The note is deliberately on its own line: a trailing comment on the
# assignment would leave trailing whitespace in the value.)
# databaseFile = /dev/shm/bioassayDatabase.sqlite
# Base URL of the pre-built data files fetched by the download rules.
dataUrl = https://biocluster.ucr.edu/~tbackman
# Credentials for the (currently disabled) direct DrugBank downloads.
# drugBankUsername = putDrugBankEmailHere
# drugBankPassword = putDrugBankPasswordHere
##########################################
# download external dependencies
# Note: all external files should go here
##########################################
# Download the prebuilt bioassayR PubChem Bioassay database.
# The file is fetched under a temporary name and renamed only after wget
# succeeds, so an interrupted download never leaves a truncated target.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/bioassayDatabaseDownloaded.sqlite:
	mkdir -p $(@D)
	wget $(dataUrl)/bioassayR/pubchem_protein_only.sqlite -O $@.tmp --no-check-certificate
	mv -f $@.tmp $@
# Download the bioassayR protein target sequences.
# Fetched to a temporary name and renamed on success so a failed download
# cannot leave a partial FASTA file behind.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/targets.fasta:
	mkdir -p $(@D)
	wget $(dataUrl)/bioassayR/targets.fasta -O $@.tmp --no-check-certificate
	mv -f $@.tmp $@
# Download a single SDF file holding all active bioassayR compounds.
# Fetched to a temporary name and renamed on success so a failed download
# cannot leave a partial SDF file behind.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/bioassayCompounds.sdf:
	mkdir -p $(@D)
	wget $(dataUrl)/bioassayR/activeCompounds.sdf -O $@.tmp --no-check-certificate
	mv -f $@.tmp $@
# Download the active bioassayR compounds as a directory of split SDF files.
# The archive is saved next to the target as $@.tgz, unpacked inside the
# target's parent directory (it extracts to `splitFolder`), and that folder
# is then renamed to the target. The archive is removed afterwards.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/bioassayCompounds:
	mkdir -p $(@D)
	wget $(dataUrl)/bioassayR/activeCompoundsSplit.tgz -O $@.tgz --no-check-certificate
	tar -xzf $@.tgz -C $(@D)
	mv $(@D)/splitFolder $@
	rm $@.tgz
# Get structures of FDA approved drugs (mirrored DrugBank 4.2 copy).
#
# Disabled alternative: fetch directly from DrugBank (needs
# drugBankUsername / drugBankPassword to be set in the user options):
#   curl -L -o $@.zip -u $(drugBankUsername):$(drugBankPassword) http://www.drugbank.ca/releases/5-0-1/downloads/approved-structures
#   unzip $@.zip -d working/
#   mv working/structures.sdf $@
#
# The output directory is created here because this rule may be the first
# one to run. The download is renamed into place only after it succeeds.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/drugbank.sdf:
	mkdir -p $(@D)
	wget $(dataUrl)/drugbank4.2/drugbank.sdf -O $@.tmp --no-check-certificate
	mv -f $@.tmp $@
# Download DrugBank target sequences (mirrored DrugBank 4.2 copy).
#
# Disabled alternative: fetch directly from DrugBank (needs
# drugBankUsername / drugBankPassword to be set in the user options):
#   curl -L -o $@.zip -u $(drugBankUsername):$(drugBankPassword) http://www.drugbank.ca/releases/5-0-1/downloads/target-approved-polypeptide-sequences
#   unzip $@.zip -d working/
#   mv working/protein.fasta $@
#
# The output directory is created here because this rule may be the first
# one to run. The download is renamed into place only after it succeeds.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/drugbank_targets.fasta:
	mkdir -p $(@D)
	wget $(dataUrl)/drugbank4.2/drugbank_targets.fasta -O $@.tmp --no-check-certificate
	mv -f $@.tmp $@
# Download DrugBank FDA-approved external drug links (mirrored 4.2 copy).
#
# Disabled alternative: fetch directly from DrugBank (needs
# drugBankUsername / drugBankPassword to be set in the user options):
#   curl -L -o $@.zip -u $(drugBankUsername):$(drugBankPassword) http://www.drugbank.ca/releases/5-0-1/downloads/approved-drug-links
#   unzip $@.zip -d working/
#   mv working/drug\ links.csv $@
#
# The output directory is created here because this rule may be the first
# one to run. The download is renamed into place only after it succeeds.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/drugbank_links.csv:
	mkdir -p $(@D)
	wget $(dataUrl)/drugbank4.2/drugbank_links.csv -O $@.tmp --no-check-certificate
	mv -f $@.tmp $@
# Download annotated DrugBank targets with UniProt links (mirrored 4.2 copy).
#
# Disabled alternative: fetch directly from DrugBank (needs
# drugBankUsername / drugBankPassword to be set in the user options):
#   curl -L -o $@.zip -u $(drugBankUsername):$(drugBankPassword) http://www.drugbank.ca/releases/5-0-1/downloads/target-approved-uniprot-links
#   unzip $@.zip -d working/
#   mv working/uniprot\ links.csv $@
#
# The output directory is created here because this rule may be the first
# one to run. The download is renamed into place only after it succeeds.
# NOTE(review): --no-check-certificate disables TLS verification; confirm
# it is still required for $(dataUrl).
working/drug_target_uniprot_links.csv:
	mkdir -p $(@D)
	wget $(dataUrl)/drugbank4.2/drug_target_uniprot_links.csv -O $@.tmp --no-check-certificate
	mv -f $@.tmp $@
# Download the Pfam-A HMMs (release 29.0) and index them with hmmpress so
# the library can be searched with hmmscan.
# The compressed download is stored as $@.gz; gunzip replaces it with $@.
# If hmmpress fails, the HMM file and any index files are removed so the
# rule is retried instead of leaving an un-indexed library that looks done.
# `hmmpress -f` overwrites index files left over from an earlier run.
working/Pfam-A.hmm:
	mkdir -p $(@D)
	wget -O $@.gz ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam29.0/Pfam-A.hmm.gz
	gunzip $@.gz
	hmmpress -f $@ || { rm -f $@ $@.h3f $@.h3i $@.h3m $@.h3p; exit 1; }
# Download the tab-separated Pfam-A family and clan table (release 29.0).
# The compressed download is stored as $@.gz; gunzip replaces it with $@,
# which is then converted to Unix line endings in place.
# If the conversion fails, the target is removed so the rule is retried.
working/Pfam-A.clans.tsv:
	mkdir -p $(@D)
	wget ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam29.0/Pfam-A.clans.tsv.gz -O $@.gz
	gunzip $@.gz
	dos2unix $@ || { rm -f $@; exit 1; }
# Download the UniProtKB human reference proteome (UP000005640, taxon 9606).
# The compressed download is stored as $@.gz; gunzip replaces it with $@,
# so the target only appears once decompression has succeeded.
working/UP000005640_9606.fasta:
	mkdir -p $(@D)
	wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606.fasta.gz -O $@.gz
	gunzip $@.gz
# Download the GO annotations for Pfam domains (pfam2go mapping).
# The output directory is created here because this rule may be the first
# one to run. The download is renamed into place only after it succeeds.
working/pfam2go:
	mkdir -p $(@D)
	wget http://geneontology.org/external2go/pfam2go -O $@.tmp
	mv -f $@.tmp $@
# Download the generic GO slim developed by the GO Consortium.
# The output directory is created here because this rule may be the first
# one to run. The download is renamed into place only after it succeeds.
working/goslim_generic.obo:
	mkdir -p $(@D)
	wget http://www.geneontology.org/ontology/subsets/goslim_generic.obo -O $@.tmp
	mv -f $@.tmp $@
##########################################
# Organize and process downloaded files
##########################################
# Remove all multi-target assays.
# The downloaded database (second prerequisite) is copied to the target and
# the script (first prerequisite) is then run on that copy.
# If the script fails, the copy is deleted. Otherwise the unfiltered copy
# would be newer than its inputs and be treated as a finished target.
working/bioassayDatabaseSingleTarget.sqlite: src/singleTargetOnly.R working/bioassayDatabaseDownloaded.sqlite
	cp $(word 2,$^) $@
	$< $@ || { rm -f $@; exit 1; }
# Create the working database at $(databaseFile). This is a copy of the
# single-target database, which lets the path be moved (e.g. to a ramdisk)
# by changing databaseFile.
# `cp -p` keeps the source timestamps. The copy is written under a temporary
# name in the same directory and renamed on success, so an interrupted copy
# never leaves a truncated database at the final path.
$(databaseFile): working/bioassayDatabaseSingleTarget.sqlite
	cp -p $< $@.tmp
	mv -f $@.tmp $@
########################################################
# Structurally cluster FDA approved drugs as a reference
########################################################
# Run the clustering script (first prerequisite) with the DrugBank SDF file,
# the DrugBank links table and the output path as its arguments.
working/structureClusterDrugs.tab: src/binningClustering.R working/drugbank.sdf working/drugbank_links.csv
	$< $(filter-out $<,$^) $@
#########################################################
# Cluster protein targets by similarity and annotate them
#########################################################
# Combine DrugBank targets and bioassay targets into one FASTA file.
# The prerequisites are concatenated in the order listed (DrugBank first).
# The result is written under a temporary name and renamed on success so a
# failure cannot leave a file containing only part of the inputs.
working/combinedTargets.fasta: working/drugbank_targets.fasta working/targets.fasta
	cat $^ > $@.tmp
	mv -f $@.tmp $@
# Cluster the combined protein targets with kClust; the target is the
# output directory handed to kClust via -d.
# The directory is recreated from scratch so no stale files from an earlier
# run survive. It is removed again if kClust fails, because an existing
# directory would otherwise count as an up-to-date target.
working/combinedCluster: working/combinedTargets.fasta
	rm -rf $@
	mkdir -p $@
	src/kClust -i $< -d $@ -s 2.93 -M 16000MB || { rm -rf $@; exit 1; }
# Parse the kClust output and fix identifier names.
# Runs the script (first prerequisite) with the kClust output directory and
# the output path as arguments.
# NOTE(review): working/combinedCluster is a directory used as a normal
# prerequisite; its timestamp changes when entries are added or removed.
working/curatedClusters.txt: src/curateClusters.R working/combinedCluster
	$< $(filter-out $<,$^) $@
# Annotate the kClust clusters via biomaRt.
# A representative protein target is chosen for each cluster, in this
# priority order:
#   (1) already annotated in DrugBank, (2) human, (3) non-human
# The script receives the curated cluster file, the database path and the
# output path as arguments.
working/clusterAnnotations.csv: src/annotateProteinClusters.R working/curatedClusters.txt $(databaseFile)
	$< $(filter-out $<,$^) $@
# Find GO terms for all annotated protein targets.
# The script receives the cluster annotation table and the output path.
working/clusterGOannotations.csv: src/clusterGOannotations.R working/clusterAnnotations.csv
	$< $(filter-out $<,$^) $@
# Find GO Slim annotations.
# The script receives the cluster GO annotation table, the generic GO slim
# file and the output path.
working/clusterGOslimAnnotations.csv: src/clusterGOslimAnnotations.R working/clusterGOannotations.csv working/goslim_generic.obo
	$< $(filter-out $<,$^) $@
# Download the UniProt GO annotation file (goa_uniprot_all.gaf).
# The compressed download is stored as $@.gz; gunzip replaces it with $@,
# so the target only appears once decompression has succeeded.
working/gene_association.goa_uniprot:
	mkdir -p $(@D)
	wget ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/goa_uniprot_all.gaf.gz -O $@.gz
	gunzip $@.gz
##############################################
# find Pfam domains within all protein targets
##############################################
# Find Pfam domains within all combined targets.
# hmmscan is run with the Pfam-A library and the combined FASTA file
# (the two prerequisites, in that order) using $(cores) CPUs.
# The table is written under a temporary name and renamed on success so a
# failed scan cannot leave a partial table that looks up to date.
working/combinedTargetDomains: working/Pfam-A.hmm working/combinedTargets.fasta
	hmmscan -E 0.01 --domE 0.01 --tblout $@.tmp --cpu $(cores) --noali $^
	mv -f $@.tmp $@
# Simplify the domain table into two columns: fields 2 and 3 of every
# non-comment line, separated by a single space.
# Output goes to a temporary file first; a plain `> $@` would create the
# target even when awk fails.
working/combinedTargetDomainsTwoCols: working/combinedTargetDomains
	awk '!/^#/ { print $$2 " " $$3 }' $< > $@.tmp
	mv -f $@.tmp $@
# Get summary statistics on the Pfam domains with hmmstat.
# Output goes to a temporary file first; a plain `> $@` would create the
# target even when hmmstat fails.
working/Pfam-A-stats.txt: working/Pfam-A.hmm
	hmmstat $< > $@.tmp
	mv -f $@.tmp $@
# Build the table of HMM residue lengths.
# The script receives the hmmstat output and the output path.
working/PfamResidueLengths.tab: src/PfamResidueLengths.R working/Pfam-A-stats.txt
	$< $(filter-out $<,$^) $@
##############################################
# find Pfam domains within human proteome
##############################################
# Find Pfam domains within all human targets.
# hmmscan is run with the Pfam-A library and the human proteome FASTA file
# (the two prerequisites, in that order) using $(cores) CPUs.
# The table is written under a temporary name and renamed on success so a
# failed scan cannot leave a partial table that looks up to date.
working/humanDomains: working/Pfam-A.hmm working/UP000005640_9606.fasta
	hmmscan -E 0.01 --domE 0.01 --tblout $@.tmp --cpu $(cores) --noali $^
	mv -f $@.tmp $@
# Simplify the human domain table into two columns: fields 2 and 3 of
# every non-comment line, separated by a single space.
# Output goes to a temporary file first; a plain `> $@` would create the
# target even when awk fails.
working/humanDomainsTwoCols: working/humanDomains
	awk '!/^#/ { print $$2 " " $$3 }' $< > $@.tmp
	mv -f $@.tmp $@
######################################################
# Summarize bioactivity data
######################################################
# Make the list of CIDs screened at least 10 times.
# The script receives the database path, the output path and the
# threshold (10) as arguments.
working/highlyScreenedCids.txt: src/highlyScreened.R $(databaseFile)
	$< $(filter-out $<,$^) $@ 10
# Make the list of all active compounds.
# The script receives the database path and the output path.
working/activeCids.txt: src/getActives.R $(databaseFile)
	$< $(filter-out $<,$^) $@
# Make the matrix of highly screened compounds vs. protein target clusters.
# The script receives the database path, the highly-screened CID list, the
# curated cluster file, the output path and $(cores).
working/cidsVStargetClusters.RData: src/cidsVStargetMatrix.R $(databaseFile) working/highlyScreenedCids.txt working/curatedClusters.txt
	$< $(filter-out $<,$^) $@ $(cores)
# Make the matrix of all compounds vs. protein targets (not clustered).
# Uses the same script as the clustered matrix, but passes the literal word
# `none` in the two positions where that rule supplies the CID list and the
# cluster file.
working/cidsVStargets.RData: src/cidsVStargetMatrix.R $(databaseFile)
	$< $(filter-out $<,$^) none none $@ $(cores)
# Use binary biclustering to extract the largest fully screened submatrix.
# The script receives the compound-vs-cluster matrix, the output path and
# $(cores).
working/fullyScreened.RData: src/extractFullyScreenedBicBin.R working/cidsVStargetClusters.RData
	$< $(filter-out $<,$^) $@ $(cores)
##########################################################
# Include the per-section makefiles.
# Note: these should not depend on one another.
# Put all shared dependencies in this file.
##########################################################
# Expands to Makefile_dataQuality, Makefile_targetSelectivity, ... and
# includes them in the order listed.
include $(addprefix Makefile_,dataQuality targetSelectivity promiscuityModel annotationAndBiclustering targetNetwork SupportingInfo)