-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
245 lines (188 loc) · 9.14 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# Directory that receives every generated mapping file.
RESULTDIR := results
# Directory holding curated mappings, lexmatch indexes and downloaded ontologies.
DATADIR := data
# Matching tasks. Rules split the name on "-" and use the first two fields
# as the ontology prefixes.
TASKS := fbbt-zfa fbbt-wbbt hsapdv-mmusdv mondo-ncit-renal-subset
# Matching methods that all-matches runs for every task.
METHODS := lexmatch logmap gpt3 gpt4
# Stage names below are commands, not files.
.PHONY: all \
        all-matches \
        combined-matches \
        curated-mappings \
        combined-curated-mappings

# Run the whole pipeline. Every stage is its own sub-make, so the stages run
# one after another in exactly this order; the combining rules glob for files
# already on disk, so the per-task files have to exist before they run.
all:
	echo "Generating all matches with the following methods: $(METHODS).."
	$(MAKE) all-matches
	echo "2/5 Generating combined results.."
	$(MAKE) combined-matches
	echo "3/5 Generating curated results"
	$(MAKE) curated-mappings
	echo "4/5 Generating combined results for curated data.."
	$(MAKE) combined-curated-mappings
	echo "5/5 Regenerate notebooks"
	$(MAKE) all-notebooks
# One mapping file per (method, task) pair, listed method by method.
all-matches: $(foreach method,$(METHODS),$(patsubst %,$(RESULTDIR)/$(method)-%.sssom.tsv,$(TASKS)))
# One combined mapping file per method.
# Only file targets are listed as prerequisites: naming $(MAKE) here would turn
# the make executable itself into a prerequisite, which fails with
# "No rule to make target" whenever $(MAKE) is not a path to an existing file.
combined-matches: $(foreach method,$(METHODS),$(RESULTDIR)/$(method)-combined.sssom.tsv)
# Curated mapping file for every task.
curated-mappings: $(patsubst %,$(DATADIR)/curated-%.sssom.tsv,$(TASKS))

# Single file concatenating all curated mapping files.
combined-curated-mappings: $(DATADIR)/curated-combined.sssom.tsv
###############################################
############ Combining results ################
###############################################
# Keep combined files even when make only built them as an intermediate step.
.PRECIOUS: $(RESULTDIR)/%-combined.sssom.tsv $(DATADIR)/%-combined.sssom.tsv

# Concatenate every <stem>-*.sssom.tsv found in the results directory.
# The inputs come from a shell glob, not from prerequisites, so this only sees
# files that already exist and is not re-run when they change.
$(RESULTDIR)/%-combined.sssom.tsv:
	./util/concat-sssom.pl $(RESULTDIR)/$*-*.sssom.tsv > $@

# Same concatenation for the files in the data directory.
$(DATADIR)/%-combined.sssom.tsv:
	./util/concat-sssom.pl $(DATADIR)/$*-*.sssom.tsv > $@
###############################################
############ Generate lexical matches #########
###############################################
# Dealing with the MONDO NCIT subset (special case, because the name of the task is different)
# Lexical match between the two extracted subset files; the first prerequisite
# is the input ontology, the second is added with -a.
$(RESULTDIR)/lexmatch-mondo-ncit-renal-subset.sssom.tsv: subsets/mondo-rare-renal.obo subsets/ncit-renal.obo
	runoak -i $< -a $(word 2,$^) lexmatch i^mondo: @ i^ncit: -o $@
.PRECIOUS: $(RESULTDIR)/lexmatch-%.sssom.tsv

# Lexical match for a task named <ont1>-<ont2>.
# The stem is split on "-" with make's own text functions, so no shell is
# forked just to compute ONT1/ONT2/DB1/DB2. The $(eval ...) lines expand to
# nothing and only set those variables for the runoak line below.
$(RESULTDIR)/lexmatch-%.sssom.tsv:
	$(eval ONT1 := $(word 1,$(subst -, ,$*)))
	$(eval ONT2 := $(word 2,$(subst -, ,$*)))
	$(eval DB1 := sqlite:obo:$(ONT1))
	$(eval DB2 := sqlite:obo:$(ONT2))
	runoak -i $(DB1) -a $(DB2) lexmatch i^$(ONT1): @ i^$(ONT2): -o $@
.PRECIOUS: $(RESULTDIR)/loom-%.sssom.tsv

# Mappings for a task named <ont1>-<ont2>, using runoak's bioportal mapper.
# The stem is split on "-" with make's own text functions instead of
# forking `echo | cut` for every variable.
# NOTE(review): the second query term is "i^$(ONT2)" without the trailing ":"
# that the other rules use - confirm whether that is intentional.
$(RESULTDIR)/loom-%.sssom.tsv:
	$(eval ONT1 := $(word 1,$(subst -, ,$*)))
	$(eval ONT2 := $(word 2,$(subst -, ,$*)))
	$(eval DB1 := sqlite:obo:$(ONT1))
	$(eval DB2 := sqlite:obo:$(ONT2))
	runoak -i $(DB1) -a $(DB2) mappings -O sssom i^$(ONT1): .or i^$(ONT2) -M $(ONT2) --mapper bioportal: -o $@
###############################################
############ Generate OntoGPT results #########
###############################################
# Set to anything other than "false" (e.g. make SKIP_GPT=true ...) to turn the
# ontogpt recipes into no-ops.
SKIP_GPT := false

# Categorise the lexmatch mappings of the same task with ontogpt's default
# model. The YAML output sits next to the target, with .results.yaml replacing
# the .sssom.tsv suffix.
$(RESULTDIR)/gpt3-%.sssom.tsv: $(RESULTDIR)/lexmatch-%.sssom.tsv
	if [ $(SKIP_GPT) = false ]; then \
		ontogpt categorize-mappings -i $< -o $@ --yaml-output $(@:.sssom.tsv=.results.yaml); \
	fi
# Categorise the lexmatch mappings of the same task with ontogpt using gpt-4.
# The YAML output goes to gpt4-<task>.results.yaml; writing it to the gpt3-*
# name would overwrite the GPT-3 run's output.
# Nothing is run when SKIP_GPT is not "false".
$(RESULTDIR)/gpt4-%.sssom.tsv: $(RESULTDIR)/lexmatch-%.sssom.tsv
	if [ $(SKIP_GPT) = false ]; then ontogpt categorize-mappings --model gpt-4 -i $< -o $@ --yaml-output $(RESULTDIR)/gpt4-$*.results.yaml; fi
###############################################
############ Generate LOGMAP results ##########
###############################################
# The --add-opens flag works around an InaccessibleObjectException, see
# https://stackoverflow.com/questions/41265266/how-to-solve-inaccessibleobjectexception-unable-to-make-member-accessible-m
LOGMAP_JAR=tools/logmap-matcher-4.0.jar
RUN_LOGMAP = java --add-opens java.base/java.lang=ALL-UNNAMED -jar $(LOGMAP_JAR) MATCHER

# Run LogMap for a task named <ont1>-<ont2> into its own work directory.
# - The two OWL files are built through a sub-make, presumably because their
#   names are derived from the stem and cannot be written as pattern
#   prerequisites.
# - $(CURDIR) is make's own absolute working directory; $(PWD) is only
#   inherited from the environment and can be stale or unset.
# - `test -f` fails the rule when the mappings file was not produced; a bare
#   `test <non-empty string>` always succeeds.
$(RESULTDIR)/logmap/work-%/logmap2_mappings.tsv:
	$(eval ONT1 := $(word 1,$(subst -, ,$*)))
	$(eval ONT2 := $(word 2,$(subst -, ,$*)))
	$(MAKE) $(DATADIR)/$(ONT1).owl $(DATADIR)/$(ONT2).owl
	mkdir -p $(@D)
	$(RUN_LOGMAP) file:$(CURDIR)/$(DATADIR)/$(ONT1).owl file:$(CURDIR)/$(DATADIR)/$(ONT2).owl $(CURDIR)/$(@D) false
	test -f $@
# Due to only dealing with a subset of Mondo and NCIT, we have a special case here (because of the task name).
# The inputs are the extracted subset OWL files rather than downloaded ontologies.
# Each step is its own recipe line so a failing mkdir stops the rule, and
# `test -f` fails the rule when the mappings file was not produced.
# $(CURDIR) is make's own absolute working directory.
$(RESULTDIR)/logmap/work-mondo-ncit-renal-subset/logmap2_mappings.tsv: subsets/mondo-rare-renal.owl subsets/ncit-renal.owl
	mkdir -p $(@D)
	$(RUN_LOGMAP) file:$(CURDIR)/subsets/mondo-rare-renal.owl file:$(CURDIR)/subsets/ncit-renal.owl $(CURDIR)/$(@D) false
	test -f $@
# Turn LogMap's raw output for a task named <ont1>-<ont2> into an SSSOM table.
# 1. util/fix-logmap.pl rewrites the raw mappings into $@.tmp (a temp name
#    derived from the target, so different tasks never share a scratch file).
# 2. runoak fill-table fills subject_label/object_label from the ontologies;
#    NONE marks a missing value.
# 3. grep keeps the header line and the rows that start with the first prefix
#    and mention the second one (`grep -E` replaces the obsolescent `egrep`).
$(RESULTDIR)/logmap-%.sssom.tsv: $(RESULTDIR)/logmap/work-%/logmap2_mappings.tsv
	$(eval ONT1 := $(word 1,$(subst -, ,$*)))
	$(eval ONT2 := $(word 2,$(subst -, ,$*)))
	$(eval DB1 := sqlite:obo:$(ONT1))
	$(eval DB2 := sqlite:obo:$(ONT2))
	./util/fix-logmap.pl $< > $@.tmp && \
	runoak -i $(DB1) -a $(DB2) fill-table $@.tmp --relation "{primary_key: subject_id, dependent_column: subject_label, relation: label}" --relation "{primary_key: object_id, dependent_column: object_label, relation: label}" --missing-value-token NONE | grep -Ei '^(subject|$(ONT1).*$(ONT2))' > $@
###############################################
############ Rebuild notebooks ################
###############################################
# .PHONY does not expand patterns, so only all-notebooks is really phony here;
# notebook-% remains an ordinary pattern rule.
.PHONY: notebook-% all-notebooks

# Rebuild the evaluation notebook for one task (or for "combined").
# $(MAKE) -C hands flags such as -n and -j down to the notebooks makefile,
# which a bare `make` would not.
notebook-%:
	$(MAKE) -C notebooks Mapping-Evaluation-$*.ipynb
# Rebuild the notebook of every task, then the combined notebook, then the
# variables file - one sub-make per step, in that order.
all-notebooks:
	$(MAKE) $(addprefix notebook-,$(TASKS))
	$(MAKE) notebook-combined
	$(MAKE) content/vars.json

# Built by util/combine_tables.py. The rule has no prerequisites, so it only
# runs while the file does not exist.
content/vars.json:
	python util/combine_tables.py
# NOTEBOOKS
# Forward any notebooks/<target> request to the makefile in notebooks/.
# $(MAKE) -C hands flags such as -n and -j down to the sub-make, which a bare
# `make` would not.
notebooks/%:
	$(MAKE) -C notebooks $*
###############################################
############ Test sets ########################
###############################################
# Instead of using the oak "mappings" command, this uses lexmatch with a strange intermediate condition.
# Curated test set for a task named <ont1>-<ont2>: lexmatch restricted by
# conf/curated-rules.yaml, with uberon and cl loaded as additional ontologies.
# The stem is split on "-" with make's own text functions instead of
# forking `echo | cut` for every variable.
$(DATADIR)/curated-%.sssom.tsv:
	$(eval ONT1 := $(word 1,$(subst -, ,$*)))
	$(eval ONT2 := $(word 2,$(subst -, ,$*)))
	$(eval DB1 := sqlite:obo:$(ONT1))
	$(eval DB2 := sqlite:obo:$(ONT2))
	runoak -i $(DB1) -a $(DB2) -a sqlite:obo:uberon -a sqlite:obo:cl lexmatch -L $(DATADIR)/index-$*.yaml -R conf/curated-rules.yaml i^$(ONT1): @ i^$(ONT2): -o $@
# Curated test set for the Mondo/NCIT renal subset: runs on the two extracted
# subset files and uses the Mondo-specific rule file.
$(DATADIR)/curated-mondo-ncit-renal-subset.sssom.tsv: subsets/mondo-rare-renal.obo subsets/ncit-renal.obo
	runoak -i $< -a $(word 2,$^) lexmatch -L $(DATADIR)/index-mondo-ncit-renal-subset.yaml -R conf/curated-rules-mondo.yaml i^mondo: @ i^ncit: -o $@
# Doesn't this make more sense here?
#$(DATADIR)/curated-mondo-ncit-renal-subset.sssom.tsv:
# runoak -i sqlite:obo:mondo mappings .desc//p=i 'kidney disease' .and .desc//p=i 'hereditary disease' -O sssom -o $@
###############################################
############ Reports as Pivot table ###########
###############################################
# Markdown report written by util/reports.py. With no prerequisites listed, it
# is only generated while the file does not exist.
$(RESULTDIR)/report_pivot_table.md:
	python util/reports.py -o $@
###############################################
############ Ontologies #######################
###############################################
# Mondo terms that are descendants of both 'kidney disease' and
# 'hereditary disease'.
subsets/mondo-rare-renal.obo:
	mkdir -p $(@D)
	runoak -i sqlite:obo:mondo extract .desc//p=i 'kidney disease' .and .desc//p=i 'hereditary disease' -o $@

# NCIT terms that are descendants of 'Kidney Disorder'.
subsets/ncit-renal.obo:
	mkdir -p $(@D)
	runoak -i sqlite:obo:ncit extract .desc//p=i 'Kidney Disorder' -o $@

# OWL version of a subset, converted from its OBO file with robot.
subsets/%.owl: subsets/%.obo
	robot convert -i $< -o $@
# Download an ontology from its OBO PURL.
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page, and -S still prints the error despite -s. The download goes to a temp
# file first, so a failed or interrupted transfer never leaves a broken .owl
# that looks up to date.
$(DATADIR)/%.owl:
	curl -L -s -S -f -o $@.tmp http://purl.obolibrary.org/obo/$*.owl && mv $@.tmp $@
###############################################
############ Install pre-requisites ###########
###############################################
# Both targets are commands, not files.
.PHONY: pip_uninstall pip_install

# Uninstall every package that `pip freeze` reports, after asking for
# confirmation. Only an exact "Y" proceeds; anything else aborts with exit 1.
# The prompt uses printf + plain `read` because `read -p` is not POSIX and
# fails under shells such as dash, which make may use as /bin/sh.
pip_uninstall:
	@printf '%s' "This will uninstall _all_ python tools in your environment. Do you want to proceed? [Y/n] "; \
	read REPLY; \
	if [ "$$REPLY" != "Y" ]; then \
		echo "Aborting."; \
		exit 1; \
	fi
	pip freeze > uninstall.txt
	pip uninstall -r uninstall.txt -y
	rm -f uninstall.txt

# Install the python requirements listed in build/requirements.txt.
pip_install:
	pip install -r build/requirements.txt
# Release archive of the standalone LogMap matcher.
LOGMAP=https://github.com/ernestojimenezruiz/logmap-matcher/releases/download/logmap-matcher-july-2021/logmap-matcher-standalone-july-2021.zip

.PHONY: install_logmap

# Download and unpack LogMap, then copy the jar and its java-dependencies
# directory into tools/.
# tools/java-dependencies is removed first: otherwise a second run would copy
# the directory *into* the existing one (tools/java-dependencies/java-dependencies).
install_logmap:
	mkdir -p tools/logmap
	rm -rf tools/logmap/* tools/java-dependencies
	wget "$(LOGMAP)" -O tools/logmap.zip
	unzip tools/logmap.zip -d tools/logmap
	cp tools/logmap/logmap-matcher-4.0.jar $(LOGMAP_JAR)
	cp -r tools/logmap/java-dependencies tools/java-dependencies
###############################################
############# Development tools ###############
###############################################
# Section can be ignored
# Developer shortcuts; none of them produces a file of that name.
.PHONY: activate ue nb

# NOTE(review): each recipe line runs in its own shell, so this activation
# ends with that shell and cannot change the environment of the caller.
activate:
	conda activate manubot

# Update the conda environment from build/environment.yml, pruning packages
# that are no longer listed.
ue:
	conda env update --file build/environment.yml --prune

# Start a Jupyter notebook server.
nb:
	jupyter notebook
###############################################
############# Utilities #######################
###############################################
.PHONY: help

# Print the usage text defined below. The text is exported to the environment
# so the shell can echo it with its line breaks intact.
help:
	@echo "$$data"

# Usage text. It only names targets and variables that exist in this
# Makefile.
define data
Usage: make [SKIP_GPT=(false|true)] command
----------------------------------------
Explanation
----------------------------------------
To run the pipeline, first install pre-requisites. See below in the command reference,
but usually you would run "make pip_install install_logmap".
----------------------------------------
Command reference
----------------------------------------
Core commands:
* all: Run the entire pipeline. Use make SKIP_GPT=true all to skip the ontogpt (GPT) calls
Installing pre-requisites:
* pip_install: Installs all the python dependencies required by the MapperGPT pipeline
* install_logmap: Downloads the LogMap matcher
endef
export data