-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
75 lines (58 loc) · 2.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torch
from torch.utils.data import DataLoader
from embedding import BERTEmbedding
from dataset import BERTDataset
from transformers import BertTokenizer
from bert_tokenizer import get_bert_tokenizer
from bert import BERT
from pretraining import BERTLM, BERTTrainer
def get_data(
    corpus_movie_conv='./datasets/movie_conversations.txt',
    corpus_movie_lines='./datasets/movie_lines.txt',
):
    """Build consecutive utterance pairs from the two corpus files.

    Both files are read as ISO-8859-1 text whose fields are separated by
    the literal string " +++$+++ ".

    Args:
        corpus_movie_conv: Path to the conversations file. The last field
            of every non-blank row must be a Python list literal of line
            ids, e.g. "['L1', 'L2', 'L3']".
        corpus_movie_lines: Path to the lines file. The first field of a
            row is the line id and the last field is the utterance text.

    Returns:
        A list of two-element lists ``[first, second]``, one for every pair
        of adjacent ids in every conversation, with surrounding whitespace
        stripped from both utterances.

    Raises:
        OSError: If either file cannot be opened.
        ValueError, SyntaxError: If an id field is not a valid literal.
        KeyError: If a conversation references an id that is missing from
            the lines file.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import ast

    separator = " +++$+++ "

    with open(corpus_movie_conv, "r", encoding="iso-8859-1") as conv_file:
        conv = conv_file.readlines()
    with open(corpus_movie_lines, "r", encoding="iso-8859-1") as lines_file:
        lines = lines_file.readlines()

    # Map line id (first field) -> utterance text (last field).
    lines_dic = {}
    for line in lines:
        objects = line.split(separator)
        lines_dic[objects[0]] = objects[-1]

    pairs = []
    for con in conv:
        # A blank row (e.g. a trailing newline at end of file) carries no ids.
        if not con.strip():
            continue
        # literal_eval only accepts literals, so text in the corpus file can
        # never be executed as code (the previous eval() would run it).
        ids = ast.literal_eval(con.split(separator)[-1].strip())
        # Pair each utterance with the one that immediately follows it.
        for first_id, second_id in zip(ids, ids[1:]):
            pairs.append(
                [lines_dic[first_id].strip(), lines_dic[second_id].strip()]
            )
    return pairs
if __name__ == "__main__":
    # Script entry point: load the dialogue pairs, obtain a tokenizer, build
    # a small BERT model and run the pre-training loop.

    # Hyper-parameters, named once so every consumer below stays in sync.
    MAX_LEN = 64          # context window handed to BERTDataset
    embedding_size = 64   # model width (d_model)
    epochs = 1
    vocab_path = "./bert-it-1/bert-it-vocab.txt"

    data = get_data()

    try:
        tokenizer = BertTokenizer.from_pretrained(
            vocab_path, local_files_only=True)
    except OSError:
        # OSError also covers FileNotFoundError. NOTE(review): transformers
        # appears to report an unresolvable tokenizer path as a plain
        # OSError, which the narrower clause would miss - confirm against
        # the installed version.
        # Presumably get_bert_tokenizer() writes the vocab file that is
        # loaded on the retry - verify in bert_tokenizer.py.
        get_bert_tokenizer(data)
        tokenizer = BertTokenizer.from_pretrained(
            vocab_path, local_files_only=True)

    vocab_size = len(tokenizer.vocab)

    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"

    train_data = BERTDataset(
        data, context_window=MAX_LEN, tokenizer=tokenizer
    )
    # Pinned host memory only serves host-to-GPU copies, so request it only
    # when a CUDA device is actually present.
    train_loader = DataLoader(
        train_data, batch_size=32, shuffle=True, pin_memory=use_cuda)

    bert_model = BERT(
        vocab_size=vocab_size,
        d_model=embedding_size,
        n_layers=1,
        n_heads=2,
        dropout=0.1)
    bert_lm = BERTLM(bert_model, vocab_size)

    bert_trainer = BERTTrainer(bert_lm, train_loader, device=device)
    for epoch in range(epochs):
        bert_trainer.train(epoch)