"""
Runs a training of the Bert model using the Lazy Tensor Core with the
TorchScript backend.
Requirements to run example:
- python -m pip install transformers datasets
- `lazy_tensor_core` Python package
For information on how to obtain the `lazy_tensor_core` Python package,
see here:
https://github.com/pytorch/pytorch/blob/lazy_tensor_staging/lazy_tensor_core/QUICKSTART.md
To run the example, make sure `/path/to/pytorch/lazy_tensor_core` is in your
PYTHONPATH. Then, run
python lazytensor_bert_example.py
The output of this example can be found in
`lazytensor_bert_example_output.txt`
Most of the code in this example was copied from the wonderful tutorial
https://huggingface.co/transformers/training.html#fine-tuning-in-native-pytorch
"""
from typing import List

import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, \
    BertTokenizer, AdamW, get_scheduler
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict

import lazy_tensor_core as ltc
from lazy_tensor_core.debug import metrics
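

# Tokenize the raw IMDb `text` field into BERT inputs, drop the `text` column,
# and rename `label` to `labels`, the argument name the model's forward expects.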
def tokenize_dataset(dataset: DatasetDict) -> DatasetDict:
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length",
                         truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(['text'])
    tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
    tokenized_datasets.set_format('torch')
    return tokenized_datasets
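

# Standard fine-tuning loop adapted from the Hugging Face tutorial linked above.
# The per-step loss tensors are collected and returned so that the graph behind
# one of them can be inspected in main().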
def train(model: BertForSequenceClassification,
          num_epochs: int,
          num_training_steps: int,
          train_dataloader: DataLoader,
          device: torch.device) -> List[torch.Tensor]:
    optimizer = AdamW(model.parameters(), lr=5e-5)
    lr_scheduler = get_scheduler('linear', optimizer=optimizer,
                                 num_warmup_steps=0,
                                 num_training_steps=num_training_steps)
    model.train()
    losses = []
    for _ in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            losses.append(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
    return losses
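

# Initialize the Lazy Tensor Core TorchScript backend, fine-tune BERT on a tiny
# IMDb subset, then print the LTC metrics report and the TorchScript graph
# traced for the first recorded loss tensor.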
def main():
    # Initialize the TorchScript backend for lazy tensors.
    ltc._LAZYC._ltc_init_ts_backend()
    # Tensors and modules placed on the 'lazy' device are handled by
    # Lazy Tensor Core instead of being executed eagerly.
    device = torch.device('lazy')

    tokenized_datasets = tokenize_dataset(load_dataset('imdb'))
    # Keep only two training examples so the example runs quickly.
    small_train_dataset = tokenized_datasets['train'].shuffle(seed=42)\
                                                     .select(range(2))
    train_dataloader = DataLoader(small_train_dataset, shuffle=True,
                                  batch_size=8)
    model = BertForSequenceClassification.from_pretrained('bert-base-cased',
                                                          num_labels=2)
    model.to(device)

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    losses = train(model, num_epochs,
                   num_training_steps, train_dataloader, device)

    print('\nMetrics report:')
    print(metrics.metrics_report())

    print('\nTorchScriptGraph:')
    # Dump the TorchScript graph the backend built for the first loss tensor.
    graph_str = ltc._LAZYC._get_ltc_tensors_backend([losses[0]])
    print(graph_str)


if __name__ == '__main__':
    main()