-
Notifications
You must be signed in to change notification settings - Fork 4
/
build_wiki_index.py
71 lines (64 loc) · 2.34 KB
/
build_wiki_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# This script is modified from https://github.com/oneal2000/DRAGIN/blob/main/prep_elastic.py
# Original code is available under the repository DRAGIN by oneal2000.
# Modifications were made by Weijian QI.
from typing import List, Tuple, Union, Dict
import argparse
import glob
import time
import csv
import json
import logging
from tqdm import tqdm
from beir.datasets.data_loader import GenericDataLoader
def build_elasticsearch(
    beir_corpus_file_pattern: str,
    index_name: str,
    port: int
):
    """Rebuild an Elasticsearch index from tab-separated corpus files.

    The index named ``index_name`` is deleted and re-created through BEIR's
    ``ElasticSearch`` wrapper, then every data row of every file matching
    ``beir_corpus_file_pattern`` is handed to its bulk-indexing method.

    Each file is read as TSV. The first row is treated as a header and
    skipped; for every following non-blank row, columns 0, 1 and 2 are used
    as the document id, the body text and the title respectively.

    Args:
        beir_corpus_file_pattern: Glob pattern selecting the TSV files.
        index_name: Name of the index to (re)create.
        port: Port of the Elasticsearch server on ``localhost``.

    Raises:
        IndexError: If a non-blank data row has fewer than three columns
            (raised while the rows are being consumed for indexing).
    """
    beir_corpus_files = glob.glob(beir_corpus_file_pattern)
    print(f'#files {len(beir_corpus_files)}')

    # Function-scope import: only needed when an index is actually built.
    from beir.retrieval.search.lexical.elastic_search import ElasticSearch

    config = {
        'hostname': {"host": "localhost", "port": port},
        'index_name': index_name,
        'keys': {'title': 'title', 'body': 'txt'},
        'timeout': 100,
        'retry_on_timeout': True,
        'maxsize': 24,
        'number_of_shards': 'default',
        'language': 'english',
    }
    es = ElasticSearch(config)

    # create index
    print(f'create index {index_name}')
    es.delete_index()
    # Pause between deletion and re-creation.
    # NOTE(review): presumably gives the server time to finish the delete - confirm.
    time.sleep(5)
    es.create_index()

    # generator
    def generate_actions():
        """Yield one bulk 'index' action per data row of the corpus files."""
        # Loop-invariant field names, looked up once.
        title_key = config['keys']['title']
        body_key = config['keys']['body']
        for beir_corpus_file in beir_corpus_files:
            # newline='' is what the csv module requires for correct handling
            # of quoted fields containing newlines; the explicit encoding
            # avoids depending on the platform's locale default.
            with open(beir_corpus_file, 'r', encoding='utf-8', newline='') as fin:
                reader = csv.reader(fin, delimiter='\t')
                # Skip the header. The default keeps an empty file from
                # raising StopIteration inside this generator, which Python
                # would convert into a RuntimeError.
                next(reader, None)
                for row in reader:
                    if not row:
                        # csv.reader yields [] for blank lines; nothing to index.
                        continue
                    _id, text, title = row[0], row[1], row[2]
                    yield {
                        '_id': _id,
                        '_op_type': 'index',
                        # NOTE(review): 'refresh' may not be treated as bulk
                        # metadata by the client and could end up stored as a
                        # document field - kept unchanged for compatibility;
                        # confirm against the Elasticsearch helpers in use.
                        'refresh': 'wait_for',
                        title_key: title,
                        body_key: text,
                    }

    # index
    progress = tqdm(unit='docs')
    es.bulk_add_to_index(
        generate_actions=generate_actions(),
        progress=progress)
if __name__ == '__main__':
    # Command-line entry point: parse the corpus location, index name and
    # server port, then rebuild the index.
    parser = argparse.ArgumentParser()
    # Both of these are needed by build_elasticsearch; requiring them makes a
    # missing flag an immediate argparse error instead of a later TypeError.
    parser.add_argument('--data_path', type=str, required=True, help='input file')
    parser.add_argument("--index_name", type=str, required=True, help="index name")
    # Parsed as int to match build_elasticsearch's `port: int` parameter.
    parser.add_argument("--port", type=int, default=None, help="elasticsearch port")
    args = parser.parse_args()
    build_elasticsearch(args.data_path, index_name=args.index_name, port=args.port)