-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathreshape_bert_for_npu.py
74 lines (60 loc) · 2.44 KB
/
reshape_bert_for_npu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from melo.api import TTS
from pathlib import Path
from openvino.runtime import Core
from openvino import Type
import openvino as ov
import time
from transformers import AutoTokenizer
import torch
# Script configuration shared by the functions below.
device = "CPU"  # device name handed to Core.compile_model in main()
language = "ZH"  # selects the model directory and the sample text/tokenizer
ov_path = "/tts_ov_" + language
# Input IR (read by main) and output IR (written by reshape_for_npu).
ov_model_path = Path(ov_path) / "bert_int8.xml"
ov_model_save_path = Path(ov_path) / "bert_static_int8.xml"
# Shape applied to every model input; presumably [batch, sequence_length].
bert_static_shape = [1, 64]
def reshape_for_npu(model, bert_static_shape):
    """Fix every input of ``model`` to one static shape and save the model.

    Args:
        model: Model object exposing ``inputs`` and ``reshape``; presumably
            the OpenVINO model returned by ``Core.read_model``.
        bert_static_shape: Shape assigned to each input of ``model``.

    The reshaped model is written to the module-level
    ``ov_model_save_path`` and the destination is printed.
    """
    # Swap the dynamic input shapes for the same fixed shape on every input.
    static_shapes = {port: bert_static_shape for port in model.inputs}
    model.reshape(static_shapes)

    target = Path(ov_model_save_path)
    ov.save_model(model, target)
    print(f"save static model in {target}")
def pad_input(input_dict, pad_length=64):
    """Return a new dict whose tensors are fitted to ``pad_length``.

    The current length of each tensor is read from dimension 1. Shorter
    tensors get zeros appended on the right of their last dimension, longer
    tensors are sliced down to ``pad_length`` along dimension 1, and tensors
    that already match are passed through unchanged (same object).

    Args:
        input_dict: Mapping of names to torch tensors with at least two
            dimensions.
        pad_length: Target length along dimension 1.

    Returns:
        A plain ``dict`` with the same keys and the fitted tensors.
    """

    def fit_to_length(tensor):
        missing = pad_length - tensor.shape[1]
        if missing == 0:
            return tensor
        if missing < 0:
            # Too long: keep only the first ``pad_length`` positions.
            return tensor[:, :pad_length]
        # Too short: zero-fill on the right-hand side.
        return torch.nn.functional.pad(tensor, (0, missing), "constant", 0)

    return {name: fit_to_length(tensor) for name, tensor in input_dict.items()}
def test_static_shape(compiled_model, device="NPU"):
    """Smoke-test a static-shape model with one padded sample sentence.

    A sample sentence chosen by the module-level ``language`` is tokenized,
    padded or truncated to the static sequence length, and run through a
    fresh infer request of ``compiled_model``.

    Args:
        compiled_model: Object exposing ``create_infer_request()``;
            presumably the result of ``Core.compile_model``.
        device: Not used by this function (the model is already compiled);
            kept so existing callers keep working.

    Returns:
        A copy of the data held by the ``"hidden_states"`` output tensor.

    Raises:
        ValueError: If ``language`` contains neither ``"ZH"`` nor ``"EN"``.
    """
    if "ZH" in language:
        model_id = 'bert-base-multilingual-uncased'
        text = "buffer是一个数据容器，可以从device和host访问。"
    elif "EN" in language:
        model_id = 'bert-base-uncased'
        text = "A buffer is a container for data that can be accessed from a device and the host."
    else:
        # Without this branch model_id/text would be unbound below.
        raise ValueError(
            f"unsupported language {language!r}: expected it to contain 'ZH' or 'EN'"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    inputs = tokenizer(text, return_tensors="pt")
    # Pad to the same sequence length the model was reshaped to, so the two
    # cannot drift apart if bert_static_shape is changed.
    padded_inputs = pad_input(inputs, pad_length=bert_static_shape[1])

    infer_request = compiled_model.create_infer_request()
    infer_request.infer(padded_inputs)
    # Copy so the result does not alias the request's output buffer.
    return infer_request.get_tensor("hidden_states").data.copy()
def main():
    """Convert the BERT model to a static shape, compile it and smoke-test it.

    Reads the model at ``ov_model_path``, lets ``reshape_for_npu`` save the
    static-shape copy, compiles that saved file for ``device`` and runs
    ``test_static_shape`` on the compiled model.
    """
    runtime = Core()
    dynamic_model = runtime.read_model(ov_model_path)
    reshape_for_npu(dynamic_model, bert_static_shape=bert_static_shape)

    # Compile from the file that reshape_for_npu just wrote.
    static_model = runtime.compile_model(ov_model_save_path, device)
    test_static_shape(static_model, "NPU")


if __name__ == "__main__":
    main()