#!/usr/bin/env python
# coding: utf-8
import torch
import torch.nn as nn
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import spacy
SEED = 1234
USE_GPU = False
torch.manual_seed(SEED)
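# NOTE: the spaCy-backed tokenizer requires the English model to be
# installed first:  python -m spacy download en_core_web_sm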
tokenizer = get_tokenizer('spacy', language="en_core_web_sm")
# Use Apple's MPS backend only when explicitly enabled; otherwise fall back to CPU.
device = torch.device('mps' if USE_GPU and torch.backends.mps.is_available() else 'cpu')
print(device)
# Depending on the torchtext version, these raw dataset iterators may be
# single-pass, so materialize them into lists for repeated traversal.
train_iter, test_iter = IMDB(split=('train', 'test'))
train_list = list(train_iter)
test_list = list(test_iter)
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)
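# Illustrative only: with the spaCy tokenizer,
#   next(yield_tokens([(1, "A great film!")]))
# would yield something like ['A', 'great', 'film', '!'] (the exact tokens
# depend on the en_core_web_sm model).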
# unk_token = '<unk>'
pad_token = '<pad>'
special_tokens = [pad_token]
# build_vocab_from_iterator counts token frequencies internally, so it is
# equivalent to building a collections.Counter manually and constructing
# the vocab from it.
# NOTE: train_iter was consumed by list(train_iter) above, so build the
# vocabulary from the materialized train_list instead.
vocabulary = build_vocab_from_iterator(yield_tokens(train_list), specials=special_tokens)
# With no '<unk>' token added, out-of-vocabulary words fall back to the pad index.
vocabulary.set_default_index(vocabulary[pad_token])
# unk_index = vocab[unk_token]
pad_index = vocabulary[pad_token]
new_stoi = vocabulary.get_stoi()
new_itos = vocabulary.get_itos()
# sentimentMap = {"pos": 0, "neg": 1}
# sentimentMap = {1: "neg", 2: "pos"}
sentimentMap = {1: 0, 2: 1}
text_transform = lambda x: vocabulary(tokenizer(x))
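# Illustrative only: text_transform("good film") returns the vocabulary
# indices of ['good', 'film']; the exact integers depend on the built vocab.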
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout)
        # hidden_dim * 2 because the final forward and backward hidden states
        # of the bidirectional LSTM are concatenated below.
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        # text: [seq_len, batch]; embedded: [seq_len, batch, embedding_dim]
        embedded = self.dropout(self.embedding(text))
        # Pack so the LSTM skips pad positions; lengths must live on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        # Concatenate the final forward (hidden[-2]) and backward (hidden[-1])
        # hidden states of the top layer: [batch, hidden_dim * 2].
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden)
# BUILDING MODEL
INPUT_DIM = len(vocabulary)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
BATCH_SIZE = 64  # unused in this inference-only script
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = new_stoi[pad_token]
model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)
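# Optional sanity check: print(model) should show Embedding(len(vocabulary), 100),
# a 2-layer bidirectional LSTM(100, 256), and a Linear(512, 1) head.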
def loadModel(model, filePath):
    model.load_state_dict(torch.load(filePath, map_location=device))

loadModel(model, 'tut1-model.pt')
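# (Assumes 'tut1-model.pt' was saved from a model trained with exactly the
# hyperparameters above; load_state_dict raises on any shape mismatch,
# e.g. a different vocabulary size.)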
nlp = spacy.load('en_core_web_sm')
def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    # Look tokens up through the Vocab object so out-of-vocabulary words hit
    # the default (pad) index instead of raising a KeyError as new_stoi would.
    indexed = vocabulary(tokenized)
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # add a batch dimension: [seq_len, 1]
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()
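# With the label mapping above (neg -> 0, pos -> 1), a sigmoid output near 0
# suggests negative sentiment and one near 1 suggests positive sentiment.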
print(predict_sentiment(model, "This film is terrible"))
print(predict_sentiment(model, "This film is very good"))