-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdict_hub.py
75 lines (52 loc) · 1.78 KB
/
dict_hub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import glob
from transformers import AutoTokenizer
from config import args
from triplet import TripletDict, EntityDict, LinkGraph
from logger_config import logger
import ipdb
# Module-level singletons shared by the whole process. Each one starts as
# None and is filled in on first use by the matching _init_* / build_*
# function defined in this module; read them through the get_* accessors.
train_triplet_dict: TripletDict = None  # built from args.train_path only
all_triplet_dict: TripletDict = None  # built from every *.txt.json next to args.train_path
link_graph: LinkGraph = None  # built from args.train_path
entity_dict: EntityDict = None  # built from the directory of args.valid_path
tokenizer: AutoTokenizer = None  # built from args.pretrained_model
def _init_entity_dict():
    """Create the module-level ``entity_dict`` on first call; later calls do nothing.

    The ``EntityDict`` is constructed with the directory that contains
    ``args.valid_path``.
    """
    global entity_dict
    # Compare against None rather than relying on truthiness: an instance
    # that is already loaded but evaluates as falsy (e.g. it defines
    # __len__ and is empty) must not be rebuilt on every call.
    if entity_dict is None:
        entity_dict = EntityDict(entity_dict_dir=os.path.dirname(args.valid_path))
def _init_train_triplet_dict():
    """Create the module-level ``train_triplet_dict`` on first call; later calls do nothing.

    The ``TripletDict`` is built from the single file ``args.train_path``.
    """
    global train_triplet_dict
    # Compare against None rather than relying on truthiness: an instance
    # that is already loaded but evaluates as falsy must not be rebuilt on
    # every call.
    if train_triplet_dict is None:
        train_triplet_dict = TripletDict(path_list=[args.train_path])
def _init_all_triplet_dict():
    """Create the module-level ``all_triplet_dict`` on first call; later calls do nothing.

    The ``TripletDict`` is built from every ``*.txt.json`` file found in the
    directory that contains ``args.train_path``.
    """
    global all_triplet_dict
    # Compare against None rather than relying on truthiness: an instance
    # that is already loaded but evaluates as falsy must not be rebuilt on
    # every call.
    if all_triplet_dict is None:
        # os.path.join keeps the pattern relative when train_path has no
        # directory part; formatting it as '{}/*.txt.json' would produce
        # '/*.txt.json' and glob the filesystem root instead.
        path_pattern = os.path.join(os.path.dirname(args.train_path), '*.txt.json')
        all_triplet_dict = TripletDict(path_list=glob.glob(path_pattern))
def _init_link_graph():
    """Create the module-level ``link_graph`` on first call; later calls do nothing.

    The ``LinkGraph`` is built from ``args.train_path``.
    """
    global link_graph
    # Compare against None rather than relying on truthiness: an instance
    # that is already loaded but evaluates as falsy must not be rebuilt on
    # every call.
    if link_graph is None:
        link_graph = LinkGraph(train_path=args.train_path)
def get_entity_dict() -> EntityDict:
    """Return the shared ``EntityDict``, creating it on first access."""
    # Delegates the "create if missing" decision to the initializer.
    _init_entity_dict()
    return entity_dict
def get_train_triplet_dict() -> TripletDict:
    """Return the shared training-set ``TripletDict``, creating it on first access."""
    # Delegates the "create if missing" decision to the initializer.
    _init_train_triplet_dict()
    return train_triplet_dict
def get_all_triplet_dict() -> TripletDict:
    """Return the shared ``TripletDict`` covering all data files, creating it on first access."""
    # Delegates the "create if missing" decision to the initializer.
    _init_all_triplet_dict()
    return all_triplet_dict
def get_link_graph() -> LinkGraph:
    """Return the shared ``LinkGraph``, creating it on first access."""
    # Delegates the "create if missing" decision to the initializer.
    _init_link_graph()
    return link_graph
def build_tokenizer(args):
    """Create the module-level ``tokenizer`` if it does not exist yet.

    Args:
        args: object exposing ``pretrained_model``, the name or path handed
            to ``AutoTokenizer.from_pretrained``. Note that this parameter
            shadows the module-level ``args`` inside the function.
    """
    global tokenizer
    if tokenizer is not None:
        # Already built; keep the existing instance.
        return
    model_name = args.pretrained_model
    # NOTE: for offline runs, from_pretrained can be given
    # local_files_only=True so that nothing is fetched from the hub.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info('Build tokenizer from {}'.format(model_name))
def get_tokenizer():
    """Return the shared tokenizer, building it from the module-level ``args`` if needed."""
    if tokenizer is not None:
        return tokenizer
    # First access: build it, then hand back the freshly assigned global.
    build_tokenizer(args)
    return tokenizer