-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransformerFeatures.py
83 lines (76 loc) · 3.21 KB
/
transformerFeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
import tensorflow as tf
import transformers
import re
class TEmbedding:
    """Token and sentence embeddings from a pretrained Hugging Face model.

    An instance loads the tokenizer and the TensorFlow model that belong to
    one of the checkpoints listed in ``allowed_models`` and exposes helpers
    to embed a single piece of text.

    Attributes:
        ids: Token ids produced by the most recent ``get_embedding`` call,
            or ``None`` if nothing has been embedded yet.
        transformer: The loaded TensorFlow transformer model.
        backend: Backend name given at construction. Only ``'tensorflow'``
            (case-insensitive) is implemented.
    """

    # Refer: https://huggingface.co/transformers/pretrained_models.html
    allowed_models = [
        'bert-base-uncased',
        'bert-large-uncased',
        'bert-base-cased',
        'bert-large-cased',
        'gpt2',
        'gpt2-medium',
        'gpt2-large',  # Use it with caution
        'gpt2-xl',  # Use it with caution
        'roberta-base',
        'roberta-large',
        'distilbert-base-uncased',
        'distilbert-base-cased',
        'albert-base-v2',
        'albert-large-v2',
        'albert-xlarge-v2'
    ]

    # (checkpoint-name prefix, tokenizer class name, TF model class name).
    # Class names are resolved lazily on the `transformers` module so that
    # rejecting an unsupported model never touches the heavy classes.
    _FAMILIES = (
        ('bert-', 'BertTokenizer', 'TFBertModel'),
        ('gpt2', 'GPT2Tokenizer', 'TFGPT2Model'),
        ('distilbert-', 'DistilBertTokenizer', 'TFDistilBertModel'),
        ('roberta-', 'RobertaTokenizer', 'TFRobertaModel'),
        ('albert-', 'AlbertTokenizer', 'TFAlbertModel'),
    )

    def __init__(self, model, backend):
        """Load the tokenizer and TF model for ``model``.

        Args:
            model: Checkpoint name; must be one of ``allowed_models``.
            backend: Name of the compute backend. It is stored as given and
                only checked when an embedding is requested.

        Raises:
            NotImplementedError: If ``model`` is not in ``allowed_models``
                or no model family is registered for it.
        """
        self.ids = None
        if model not in TEmbedding.allowed_models:
            raise NotImplementedError(f'The given model {model} is not supported!!')

        for prefix, tokenizer_name, model_name in self._FAMILIES:
            if model.startswith(prefix):
                break
        else:
            # Only reachable if allowed_models gains an entry without a
            # matching family; fail loudly instead of leaving the instance
            # without a tokenizer/model.
            raise NotImplementedError(f'The given model {model} is not supported!!')

        self._tokenizer = getattr(transformers, tokenizer_name).from_pretrained(model)
        self.transformer = getattr(transformers, model_name).from_pretrained(model)
        self.backend = backend

    @staticmethod
    def _content_bounds(special_mask):
        """Return ``(start, stop)`` slicing off a leading/trailing special token.

        ``special_mask`` is a sequence of truthy/falsy flags, one per token,
        where a truthy value marks a special token. Only the first and the
        last position are inspected, so flagged tokens in the middle of the
        sequence are kept.
        """
        start = 1 if special_mask and special_mask[0] else 0
        stop = len(special_mask)
        if stop > start and special_mask[-1]:
            stop -= 1
        return start, stop

    def get_embedding(self, text, device):
        """Tokenize ``text`` and run it through the transformer.

        The token ids are stored on ``self.ids`` as a side effect.

        Args:
            text: Text to encode with the tokenizer.
            device: Currently unused; kept for interface compatibility.

        Returns:
            The raw output of calling the model on a batch of one sequence.

        Raises:
            NotImplementedError: If the configured backend is not
                ``'tensorflow'``.
        """
        if self.backend.lower() != 'tensorflow':
            # Previously this silently returned None, which only surfaced
            # later as an unrelated TypeError in sent2vec.
            raise NotImplementedError(f'Backend: {self.backend} is not implemented!!')
        self.ids = self.tokenizer.encode(text)
        # Add a leading batch axis: the model is called on a batch of one.
        batch = tf.constant(self.ids)[None, :]
        return self.transformer(batch)

    def sent2vec(self, text, criteria='avg', device='cpu'):
        """Embed ``text`` as one vector by pooling its token embeddings.

        Args:
            text: Text to embed.
            criteria: Pooling strategy. Only ``'avg'`` (case-insensitive)
                is implemented: the mean over token embeddings.
            device: Forwarded to ``get_embedding`` (currently unused there).

        Returns:
            A 1-D NumPy array: the mean of the token embeddings taken from
            the first element of the model output. If no tokens remain
            after trimming (e.g. empty text), the mean is taken over an
            empty slice and the result is NaN-filled.

        Raises:
            NotImplementedError: If ``criteria`` is not ``'avg'`` or the
                backend is unsupported.
        """
        # Validate first so an unsupported criteria does not cost a forward pass.
        if criteria.lower() != 'avg':
            raise NotImplementedError(f'Criteria: {criteria} is not implemented!!')

        em = self.get_embedding(text, device)[0].numpy()
        # Drop the embeddings of wrapping special tokens such as [CLS]/[SEP],
        # but only when the tokenizer actually added them: GPT-2 style
        # tokenizers add none, and blindly trimming both ends would throw
        # away real tokens.
        special_mask = self.tokenizer.get_special_tokens_mask(
            self.ids, already_has_special_tokens=True
        )
        start, stop = self._content_bounds(special_mask)
        # Batch size is always 1 here, so index it away before pooling.
        return np.mean(em[0, start:stop, :], axis=0)

    @property
    def tokenizer(self):
        """The tokenizer loaded for the selected checkpoint."""
        return self._tokenizer

    @property
    def token_ids(self):
        """Token ids of the most recently embedded text, or ``None``."""
        return self.ids
if __name__ == '__main__':
    # Smoke test: embed one sentence with DistilBERT on the TensorFlow
    # backend and print the shape of the resulting sentence vector.
    text = 'Hello, my dog is very cute!'
    te = TEmbedding(model='distilbert-base-uncased', backend='tensorflow')
    sentence_vector = te.sent2vec(text)
    print(sentence_vector.shape)