### Input variables and their default values
engine ?= llama# or 'gpt-4'
temperature ?= 1.0
colbert_endpoint ?= http://127.0.0.1:5000/search
local_engine_endpoint ?= http://127.0.0.1:5002
local_engine_prompt_format ?= simple# one of: 'none', 'alpaca', 'simple'
experiment_id ?= mturk_1# used in db-to-file to extract mturk experiment data
pipeline ?= early_combine_with_replacement# one of: verify_and_correct, retrieve_and_generate, generate, retrieve_only, early_combine, atlas
evi_num ?= 2# number of evidence passages for each subclaim verification; 3 for retrieval
reranking_method ?= date# 'none' or 'date'
do_refine ?= true
skip_verification ?= false
debug_mode ?= false
refinement_prompt ?= prompts/refine.prompt

PIPELINE_FLAGS = --pipeline $(pipeline) \
	--engine $(engine) \
	--local_engine_endpoint $(local_engine_endpoint) \
	--local_engine_prompt_format $(local_engine_prompt_format) \
	--claim_prompt_template_file prompts/split_claim_rewrite.prompt \
	--refinement_prompt $(refinement_prompt) \
	--colbert_endpoint $(colbert_endpoint) \
	--reranking_method $(reranking_method) \
	--evi_num $(evi_num) \
	--temperature $(temperature)

ifeq ($(do_refine), true)
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --do_refine
endif
ifeq ($(skip_verification), true)
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --skip_verification
endif
ifeq ($(debug_mode), true)
PIPELINE_FLAGS := $(PIPELINE_FLAGS) --debug_mode
endif
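
# Any of the variables above can be overridden on the make command line,
# e.g. (illustrative values only):
#   make demo engine=gpt-4 pipeline=generate do_refine=false evi_num=3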

.PHONY: demo start-colbert start-backend db-to-file process-stackexchange-corpus download-latest-wiki extract-wiki split-wiki index-wiki tar-wiki-index index-domain-corpus start-colbert-domain demo-domain start-backend-domain

demo:
	python chat_interactive.py \
		$(PIPELINE_FLAGS) \
		--output_file data/demo.txt

start-colbert:
	python colbert_app.py \
		--colbert_index_path $(colbert_index_path) \
		--colbert_checkpoint workdir/colbert_model \
		--colbert_collection_path $(colbert_collection_path)
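
# `colbert_index_path` and `colbert_collection_path` have no defaults in this
# file, so pass them when invoking the target. A sketch with hypothetical
# Wikipedia-index paths:
#   make start-colbert colbert_index_path=experiments/wikipedia_all/indexes/wikipedia.all.2bits colbert_collection_path=workdir/en/wikipedia_07_20_2023/collection_all.tsv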

start-backend:
	python server_api.py \
		$(PIPELINE_FLAGS) \
		# --test
		# --no_logging
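
# Several targets below list `workdir/` as a prerequisite, but no rule for it
# appears in this file; a minimal rule (an assumption) that creates the directory:
workdir/:
	mkdir -p $@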

db-to-file:
	python db_analytics.py --experiment_id $(experiment_id) --output_file data/db_analytics/$(experiment_id).json
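
# Example: export the default mturk experiment's data
# (writes data/db_analytics/mturk_1.json):
#   make db-to-file experiment_id=mturk_1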

##### StackExchange #####
domain ?= cooking# TODO: modify this to process your domain of interest.

process-stackexchange-corpus: workdir/
	wget -O workdir/$(domain).stackexchange.com.7z https://archive.org/download/stackexchange/$(domain).stackexchange.com.7z && \
	cd stackexchange && \
	python unzip_7z.py --zip_file ../workdir/$(domain).stackexchange.com.7z --result_dir ../workdir/$(domain) && \
	cd PyStack/pre_processing && \
	python pystack.py --input ../../../workdir/$(domain) --task all && \
	cd ../.. && \
	python build_corpus.py --input ../workdir/$(domain) --domain $(domain)
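
# Example: preprocess a different StackExchange domain; any dump name listed at
# https://archive.org/download/stackexchange should work:
#   make process-stackexchange-corpus domain=travel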

##### ColBERT indexing #####
nbits ?= 2# encode each dimension with 2 bits
max_block_words ?= 100# maximum number of words in each paragraph
doc_maxlen ?= 140# maximum number of tokens, to account for the 100 words plus the title included in each Wikipedia paragraph
checkpoint ?= ./workdir/colbert_model
split ?= all# index '1m' passages or 'all' of them
experiment_name ?= wikipedia_$(split)
#index_name ?= wikipedia.$(split).$(nbits)bits
wiki_date ?= 07_20_2023
language ?= en
#collection ?= ./workdir/$(language)/wikipedia_$(wiki_date)/collection_$(split).tsv
collection ?= ./workdir/cooking/cooking_collection.tsv
#collection ?= ./stackexchange/PyStack/dataset/$(domain)/$(domain)_collection_test_index.tsv
nranks ?= 1# number of GPUs to use
#test_query ?= "What ingredients can I substitute for parsley?"
#test_query ?= "What should I do if my F-1 visa becomes out-of-date during my PhD study in the US?"
test_query ?= "When should I use unique_ptr in C++?"
index_name = $(domain).$(nbits)bits# overrides the commented-out wikipedia index name above
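
# These can all be overridden per invocation, e.g. (illustrative values only):
#   make index-domain-corpus nbits=4 nranks=2 domain=cooking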

# takes ~70 minutes; using `axel` instead of `wget` can speed up the download
download-latest-wiki: workdir/
	wget -O workdir/$(language)wiki-latest-pages-articles.xml.bz2 https://dumps.wikimedia.org/$(language)wiki/latest/$(language)wiki-latest-pages-articles.xml.bz2
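
# A sketch of the equivalent `axel` download (assuming axel is installed;
# -n sets the number of parallel connections, -o the output path):
#   axel -n 10 -o workdir/enwiki-latest-pages-articles.xml.bz2 https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2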

# takes ~5 hours
extract-wiki: workdir/$(language)wiki-latest-pages-articles.xml.bz2
	python -m wikiextractor.wikiextractor.WikiExtractor workdir/$(language)wiki-latest-pages-articles.xml.bz2 --output workdir/$(language)/text/

# takes ~1 minute depending on the number of cpu cores and type of hard disk
split-wiki: workdir/$(language)/text/
	python wikiextractor/split_passages.py \
		--input_path workdir/$(language)/text/ \
		--output_path $(collection) \
		--max_block_words $(max_block_words)

# takes 24 hours on a 40GB A100 GPU
index-wiki: $(collection)
	python index_wiki.py \
		--nbits $(nbits) \
		--doc_maxlen $(doc_maxlen) \
		--checkpoint $(checkpoint) \
		--split $(split) \
		--experiment_name $(experiment_name) \
		--index_name $(index_name) \
		--collection $(collection) \
		--nranks $(nranks)
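
# The full Wikipedia pipeline chains the targets above. Note that `collection`
# defaults to the cooking corpus here, so override it for Wikipedia, e.g.:
#   make download-latest-wiki extract-wiki split-wiki index-wiki collection=./workdir/en/wikipedia_07_20_2023/collection_all.tsv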

tar-wiki-index:
	tar -cvf colbert_wikipedia_index_$(wiki_date).tar workdir/wikipedia_$(wiki_date)/

# The commands below were newly added by Yijia for domain-specific corpora.
index-domain-corpus: $(collection)
	python index_domain_corpus.py \
		--nbits $(nbits) \
		--doc_maxlen $(doc_maxlen) \
		--checkpoint $(checkpoint) \
		--experiment_name $(domain) \
		--index_name $(index_name) \
		--collection $(collection) \
		--nranks $(nranks) \
		--test_query $(test_query)
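
# Example: index the corpus built by process-stackexchange-corpus. The default
# test_query is a C++ question, so override it for the cooking domain:
#   make index-domain-corpus domain=cooking test_query="What ingredients can I substitute for parsley?"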

start-colbert-domain:
	python colbert_app.py \
		--colbert_index_path experiments/$(domain)/indexes/$(index_name) \
		--colbert_checkpoint workdir/colbert_model \
		--colbert_collection_path stackexchange/PyStack/dataset/$(domain)/$(domain)_collection.tsv

demo-domain:
	python chat_interactive.py \
		$(PIPELINE_FLAGS) \
		--output_file data/$(domain)_demo.txt
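
# A typical domain workflow runs the retriever and the demo in two terminals:
#   make start-colbert-domain domain=cooking
#   make demo-domain domain=cooking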

start-backend-domain:
	python server_api_domain_bot.py \
		$(PIPELINE_FLAGS) \
		# --test
		# --no_logging