inference-demo.py
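"""Minimal Hugging Face text-generation demo.

Loads a causal language model (default: EleutherAI/gpt-neo-1.3B) and samples
four continuations of a prompt, on GPU if one is available, otherwise on CPU.

Example invocation (model and prompt are configurable via the flags below):
    python inference-demo.py --model EleutherAI/gpt-neo-1.3B \
        --prompt "The movie about how AI will take over the world was great because"
"""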
import argparse
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="EleutherAI/gpt-neo-1.3B",
        help="The pre-trained model from Hugging Face to use as basis: "
             "https://huggingface.co/models"
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default="The movie about how AI will take over the world was great because",
        help="Prompt for the LLM to continue"
    )
    args = parser.parse_args()

    # Run on a GPU if one is available, otherwise fall back to the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print(f"Using device {device}")
    if device.type == 'cuda':
        print(f"Device name is {torch.cuda.get_device_name(device)}")

    # Load the tokenizer and model, then move the model to the selected device.
    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(args.model)
    model.to(device)

    # Sample four continuations of the prompt, each up to 80 tokens long.
    with torch.no_grad():
        inputs = tokenizer(args.prompt, return_tensors='pt').to(device)
        outputs = model.generate(**inputs, do_sample=True, max_length=80, num_return_sequences=4)

    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print('Sample generated reviews:')
    for i, txt in enumerate(decoded_outputs):
        print("#######################")
        print(f"{i+1}: {txt}")

    # for device_id in range(torch.cuda.device_count()):
    #     print(f"- GPU {device_id} max memory allocated: "
    #           f"{torch.cuda.max_memory_allocated(device_id)/1024/1024:.2f}MB")