forked from rasbt/LLMs-from-scratch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhparam_search.py
215 lines (170 loc) · 7.71 KB
/
hparam_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
import itertools
import math
import os
import tiktoken
import torch
from previous_chapters import GPTModel, create_dataloader_v1
# Search space for the grid search: every key maps to the list of candidate
# values tried for that hyperparameter. Key order is significant, because the
# search pairs these keys positionally with the value tuples it generates.
HPARAM_GRID = {
    "batch_size": [2, 4, 8, 16],
    "drop_rate": [0.0, 0.1, 0.2],
    "warmup_iters": [10, 20, 30],
    "weight_decay": [0.1, 0.01, 0.0],
    # Learning rates, written in scientific notation for easier comparison
    "peak_lr": [1e-4, 5e-4, 1e-3, 5e-3],
    "initial_lr": [5e-5, 1e-4],
    "min_lr": [5e-5, 1e-5, 1e-4],
    "n_epochs": [5, 10, 15, 20, 25],
}
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean per-batch loss over the leading batches of a loader.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``
            pairs.
        model: Model forwarded unchanged to ``calc_loss_batch``.
        device: Device forwarded unchanged to ``calc_loss_batch``.
        num_batches: Maximum number of batches to average over. ``None``
            (the default) uses every batch; larger values are capped at
            ``len(data_loader)``.

    Returns:
        The arithmetic mean of the per-batch losses as a Python float, or
        ``float("nan")`` when there is nothing to average (empty loader or
        ``num_batches <= 0``).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    if num_batches <= 0:
        # Nothing was requested: report "no data" rather than dividing by zero.
        return float("nan")

    total_loss = 0.
    # islice stops after exactly num_batches items, so the loader is never
    # asked to produce a batch that would immediately be discarded.
    for input_batch, target_batch in itertools.islice(data_loader, num_batches):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches
def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the cross-entropy loss of ``model`` on a single batch.

    Both batches are moved to ``device`` before the forward pass. The model
    output is reshaped to two dimensions (all leading dimensions merged, last
    dimension kept) and the targets are flattened to one dimension before
    being handed to ``torch.nn.functional.cross_entropy``.

    Returns:
        The loss tensor produced by ``cross_entropy``.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)

    scores = model(inputs)
    # Keep the last dimension as the class axis; merge everything else.
    num_classes = scores.size(-1)
    return torch.nn.functional.cross_entropy(
        scores.view(-1, num_classes),
        targets.view(-1),
    )
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Measure the loss on the training and validation loaders.

    The model is put into eval mode, both losses are computed with gradient
    tracking disabled (at most ``eval_iter`` batches per loader), and the
    model is then put back into train mode regardless of the mode it was in
    before the call.

    Returns:
        A ``(train_loss, val_loss)`` tuple.
    """
    model.eval()
    losses = []
    with torch.no_grad():
        # Training loader first, validation loader second.
        for loader in (train_loader, val_loader):
            losses.append(
                calc_loss_loader(loader, model, device, num_batches=eval_iter)
            )
    model.train()
    train_loss, val_loss = losses
    return train_loss, val_loss
def _lr_for_step(global_step, warmup_iters, total_training_iters,
                 initial_lr, max_lr, min_lr):
    """Return the scheduled learning rate for a 1-based ``global_step``."""
    if global_step < warmup_iters:
        # Linear warmup from initial_lr towards max_lr. The division is safe:
        # this branch is only taken when warmup_iters > global_step.
        lr_increment = (max_lr - initial_lr) / warmup_iters
        return initial_lr + global_step * lr_increment

    # Cosine annealing from max_lr down to min_lr over the remaining steps.
    decay_iters = total_training_iters - warmup_iters
    if decay_iters > 0:
        progress = (global_step - warmup_iters) / decay_iters
    else:
        # Warmup spans the entire run, so there is no decay phase to
        # interpolate over; stay at the peak instead of dividing by zero.
        progress = 0.0
    return min_lr + (max_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))


def train_model(model, train_loader, val_loader, optimizer, device,
                n_epochs, eval_freq, eval_iter,
                encoded_start_context, tokenizer, warmup_iters=10,
                initial_lr=3e-05, min_lr=1e-6):
    """Train ``model`` with linear LR warmup followed by cosine decay.

    The learning rate configured on the optimizer's first parameter group
    when this function is called is used as the peak of the schedule. The
    computed rate is written to every parameter group before each step.

    Args:
        model: Model to train; switched to train mode at the start of each
            epoch.
        train_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
        val_loader: Loader used only for the final evaluation.
        optimizer: Optimizer whose parameter groups are updated in place.
        device: Device forwarded to the loss helpers.
        n_epochs: Number of passes over ``train_loader``.
        eval_freq: Not used by this function; kept for signature
            compatibility.
        eval_iter: Maximum number of batches per loader in the final
            evaluation.
        encoded_start_context: Not used by this function; kept for signature
            compatibility.
        tokenizer: Not used by this function; kept for signature
            compatibility.
        warmup_iters: Step count at which warmup ends and cosine decay begins.
        initial_lr: Learning rate the warmup ramp starts from.
        min_lr: Learning rate reached at the end of cosine decay.

    Returns:
        ``(train_loss, val_loss)`` as computed by ``evaluate_model`` after the
        last epoch.
    """
    max_lr = optimizer.param_groups[0]["lr"]

    # Total number of optimizer steps across all epochs
    total_training_iters = len(train_loader) * n_epochs

    global_step = 0
    for _ in range(n_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()

            # Steps are counted from 1: increment before computing the rate
            global_step += 1

            lr = _lr_for_step(
                global_step, warmup_iters, total_training_iters,
                initial_lr, max_lr, min_lr,
            )

            # Apply the calculated learning rate
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr

            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()

            # Gradient clipping is applied only once warmup has finished
            if global_step >= warmup_iters:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

    train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
    return train_loss, val_loss
if __name__ == "__main__":
    # Script entry point: exhaustively evaluate every combination in
    # HPARAM_GRID and report the one with the lowest validation loss.

    # Generate all combinations of hyperparameters
    hyperparameter_combinations = list(itertools.product(*HPARAM_GRID.values()))
    total_combinations = len(hyperparameter_combinations)
    print(f"Total hyperparameter configurations: {total_combinations}")

    # Best result seen so far. The training loss is tracked alongside the
    # validation loss so the final report describes the *best* configuration,
    # and so the report is well-defined even if no configuration finished.
    best_val_loss = float('inf')
    best_train_loss = float('inf')
    best_hparams = {}

    # The training text is expected next to this script
    script_path = os.path.abspath(__file__)
    script_dir = os.path.dirname(script_path)

    with open(os.path.join(script_dir, "the-verdict.txt"), "r", encoding="utf-8") as file:
        text_data = file.read()

    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Split the raw text into training and validation portions
    train_ratio = 0.95
    split_idx = int(train_ratio * len(text_data))

    torch.manual_seed(123)

    # The start context does not depend on the hyperparameters, so it is
    # encoded once instead of once per configuration.
    encoded_start_context = tokenizer.encode("Nevertheless")
    encoded_tensor = torch.tensor(encoded_start_context).unsqueeze(0)

    for current_config, combination in enumerate(hyperparameter_combinations, start=1):
        try:
            print(f"Evaluating configuration {current_config} of {total_combinations}")

            # Pair each hyperparameter name with its value in this combination
            HPARAM_CONFIG = dict(zip(HPARAM_GRID.keys(), combination))

            GPT_CONFIG_124M = {
                "vocab_size": 50257,    # Vocabulary size
                "context_length": 256,  # Context length -- shortened from original 1024 tokens
                "emb_dim": 768,         # Embedding dimension
                "n_heads": 12,          # Number of attention heads
                "n_layers": 12,         # Number of layers
                "drop_rate": HPARAM_CONFIG["drop_rate"],
                "qkv_bias": False,      # Query-Key-Value bias
            }

            # Re-seed so every configuration starts from the same RNG state
            torch.manual_seed(123)

            train_loader = create_dataloader_v1(
                text_data[:split_idx],
                batch_size=HPARAM_CONFIG["batch_size"],
                max_length=GPT_CONFIG_124M["context_length"],
                stride=GPT_CONFIG_124M["context_length"],
                drop_last=True,
                shuffle=True,
                num_workers=0
            )

            val_loader = create_dataloader_v1(
                text_data[split_idx:],
                batch_size=HPARAM_CONFIG["batch_size"],
                max_length=GPT_CONFIG_124M["context_length"],
                stride=GPT_CONFIG_124M["context_length"],
                drop_last=False,
                shuffle=False,
                num_workers=0
            )

            model = GPTModel(GPT_CONFIG_124M)
            model.to(device)

            optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=HPARAM_CONFIG["peak_lr"],
                weight_decay=HPARAM_CONFIG["weight_decay"]
            )

            train_loss, val_loss = train_model(
                model, train_loader, val_loader, optimizer, device,
                n_epochs=HPARAM_CONFIG["n_epochs"],
                eval_freq=5, eval_iter=1,
                encoded_start_context=encoded_tensor,
                tokenizer=tokenizer,
                warmup_iters=HPARAM_CONFIG["warmup_iters"],
                initial_lr=HPARAM_CONFIG["initial_lr"],
                min_lr=HPARAM_CONFIG["min_lr"]
            )

            # Keep the configuration with the lowest validation loss
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_train_loss = train_loss
                best_hparams = HPARAM_CONFIG

        except KeyboardInterrupt:
            # Stop searching early, but still report the best result so far
            break

    # Reported both after a full search and after an interrupted one
    print("Hyperparameter search completed.")
    print(f"Best hyperparameters: {best_hparams}")
    print(f"Best Val loss: {best_val_loss} | Training loss {best_train_loss}")