training_model.py
import random
import json
import pickle
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
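# nltk.word_tokenize and WordNetLemmatizer need the "punkt" and "wordnet" NLTK data
# packages; if they are missing, download them once with nltk.download("punkt")
# and nltk.download("wordnet").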
lemmatizer = WordNetLemmatizer()
words = []  # vocabulary built from the patterns in intents.json
diseases = []  # disease tags from responses.json
documents = []  # list of (tokenized pattern, intent tag) pairs
classes = []  # intent classes from intents.json
ignore_words = ["?", "!", "XXXX", "disease"]
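# load the disease tags from responses.json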
file = open("data/responses.json", "r", encoding="UTF-8")
data = json.load(file)
file.close()
for disease in data["responses"]:
d = disease["tag"]
diseases.append(d)
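# tokenize every pattern in intents.json and record (tokens, intent tag) documents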
file = open("data/intents.json", "r", encoding="UTF-8")
intents = json.load(file)
file.close()
for intent in intents["intents"]:
for pattern in intent["patterns"]:
w = nltk.word_tokenize(pattern)
words.extend(w)
documents.append((w, intent["tag"]))
if intent["tag"] not in classes:
classes.append(intent["tag"])
# pre-process and organise the data
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))
classes = sorted(list(set(classes)))
diseases = sorted(list(set(diseases)))
print(len(documents), "documents")
# print("Documents: ", documents)
print(len(words), "unique lemmatized words", words)
print("diseases: ", diseases)
print("classes: ", classes)
print("")
pickle.dump(words, open("data/words.pkl", "wb"))
pickle.dump(diseases, open("data/diseases.pkl", "wb"))
pickle.dump(classes, open("data/classes.pkl", "wb"))
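# build the training set: one bag-of-words vector and one one-hot intent label per document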
training = []
output_empty = [0] * len(classes)
for doc in documents:
    bag = []
    # lemmatize the pattern tokens so they match the processed vocabulary in `words`
    pattern_words = [lemmatizer.lemmatize(w.lower()) for w in doc[0]]
    for w in words:
        if w in pattern_words:
            bag.append(1)
        else:
            bag.append(0)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    training.append([bag, output_row])
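# shuffle the examples and split them into input features (x) and labels (y)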
random.shuffle(training)
train_x = np.array([i[0] for i in training])
train_y = np.array([i[1] for i in training])
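# feed-forward classifier: two hidden ReLU layers with dropout, softmax over the intent classes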
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation="softmax"))
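# note: the `decay` argument is only accepted by older Keras/TF optimizers;
# newer releases replace it with learning-rate schedules.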
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])
HIST = model.fit(train_x, train_y, epochs=500, batch_size=5, verbose=1)
model.save("models/intent_classification.h5")
print("model trained and saved")