This repository was archived by the owner on Jun 14, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
49 lines (39 loc) · 1.59 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
## malek abid dec 4th 2022
## last edit april 30 2023
## onelook thesaurus revdic model data pre-processing
import sys
# Extend the import search path so the helper module below can be found.
# The path is relative, so it resolves against the current working directory.
sys.path.append("./code/utils")
# Wildcard import: presumably supplies the helpers that process_data calls
# (process_tsinghua_data, add_word2vec_embedding, filter_hill_data,
# check_data) -- TODO confirm against code/utils/data_process.py.
from data_process import *
# Dataset split names; process_data interpolates one of these into the
# ./data/wwdata/{ftype}*.json file names.
file_types = ["train", "dev", "seen", "unseen", "desc"]
def process_data(ftype):
    """Run the pre-processing pipeline for one dataset split.

    Each step reads the file written by the previous one. All paths are
    relative to the current working directory:

    1. ``process_tsinghua_data``: ``{ftype}.json`` -> ``{ftype}_stripped.json``
    2. ``add_word2vec_embedding``: ``{ftype}_stripped.json`` -> ``{ftype}_w2v.json``
    3. ``filter_hill_data``: ``{ftype}_w2v.json`` -> ``{ftype}_label_list.json``
    4. ``check_data`` on the final file (best-effort; failures are reported,
       not raised).

    Args:
        ftype: Split name used to build the file names, e.g. an entry of
            ``file_types``.

    Returns:
        None. Progress is reported on stdout.
    """
    base = f"./data/wwdata/{ftype}"
    raw_path = f"{base}.json"
    stripped_path = f"{base}_stripped.json"
    w2v_path = f"{base}_w2v.json"
    label_path = f"{base}_label_list.json"

    print("Processing data:")
    process_tsinghua_data(raw_path, stripped_path)
    print("Linguistic annotations successfully stripped")

    # Disabled alternative: fastText embeddings.
    # add_fasttext_embedding(stripped_path, f"{base}_ft.json")
    # print("Fasttext embeddings completed")

    add_word2vec_embedding(stripped_path, w2v_path, ftype)
    print("Word2vec embeddings completed.")  # w2v works best for revdict

    # Disabled alternative: BERT embeddings (for definition modelling).
    # add_bert_embedding_from_def(stripped_path, f"{base}_bert.json", ftype)
    # print("BERT embeddings completed.")

    filter_hill_data(w2v_path, label_path)
    print("Duplicates cleaned successfully")

    try:
        check_data(label_path)
    except Exception as exc:
        # Validation stays best-effort, but unlike a bare ``except:`` this
        # lets KeyboardInterrupt / SystemExit propagate and shows what failed.
        print("Some errors^")
        print(f"check_data failed: {exc!r}")

    print("Complete.")
# process_data(file_types[0]) # train
# process_data(file_types[1]) # dev
# process_data(file_types[2]) # seen
# process_data(file_types[3]) # unseen
# process_data(file_types[4]) # human desc