llama.lua · 300 lines (252 loc) · 9.49 KB
local backend, model_path, prompt = ...
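-- Command-line arguments: the compute backend name (passed to Tensor:UseComputeKernel; "cuda"
-- additionally enables the GPU upload path below), the path to a GGUF model file, and the prompt.
-- A typical invocation might look like this (paths are illustrative, assuming LuaJIT with the
-- repository's modules on package.path):
--   luajit llama.lua cuda path/to/model.gguf "why is the sky blue?"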
require("debug.luajit_options").SetOptimized()
local profiler = require("debug.profiler")
local get_time = require("debug.get_time")
local measure = require("debug.measure")
local gguf = require("gguf")
local Tokenizer = require("tokenizer")
local Tensor = require("tensor")
Tensor:UseComputeKernel(backend)
local Sampler = require("topp_sampler")
local function load_and_run(model_path, prompt, token_callback)
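-- Generation settings: context window length, sampling temperature, nucleus (top-p) threshold,
-- maximum number of generated tokens, and the RNG seed.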
local context_length = 512
local temperature = 0.1
local topp = 0.95
local max_tokens = math.huge
local seed = 1337
local metadata, tensors = gguf.load(model_path)
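-- Only GGUF models that ship a gpt2-style BPE tokenizer (token list plus merge rules) are supported.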
assert(metadata["tokenizer.ggml.model"] == "gpt2")
assert(metadata["tokenizer.ggml.tokens"])
assert(metadata["tokenizer.ggml.merges"])
do
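-- Model hyperparameters read from the GGUF metadata.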
local dim = metadata["llama.embedding_length"]
local hidden_dim = metadata["llama.feed_forward_length"]
local number_of_layers = metadata["llama.block_count"]
local number_of_heads = metadata["llama.attention.head_count"]
local number_of_key_value_heads = metadata["llama.attention.head_count_kv"] or
metadata["llama.attention.head_count"]
local vocabulary_size = #metadata["tokenizer.ggml.tokens"]
local rms_norm_eps = metadata["llama.attention.layer_norm_rms_epsilon"] or 1e-5
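-- Derived sizes, including the grouped-query attention factors (kv_dim and kv_mul).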
local head_size = dim / number_of_heads
local sqrt_head_size = math.sqrt(head_size)
local kv_dim = (dim * number_of_key_value_heads) / number_of_heads
local kv_mul = number_of_heads / number_of_key_value_heads
local rope_theta = metadata["llama.rope.freq_base"] or 10000
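-- Precompute the RoPE rotation table: one cos/sin pair per (position, frequency pair) for up to
-- context_length positions.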
local function build_rope_freqs(context_length, rope_theta)
rope_theta = rope_theta or 10000
assert(head_size % 2 == 0)
local cr = {}
local ci = {}
local n = 1
for pos = 0, context_length - 1 do
for i = 0, head_size - 1, 2 do
local freq = 1.0 / (rope_theta ^ (i / head_size))
local val = pos * freq
cr[n] = math.cos(val)
ci[n] = math.sin(val)
n = n + 1
end
end
n = n - 1
assert(context_length * (head_size / 2) == n)
return cr, ci
end
local rope_freqs_real, rope_freqs_imag = build_rope_freqs(context_length, rope_theta)
local floor = math.floor
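-- One transformer forward pass for the token at state.token_pos: embedding lookup, then for each
-- layer RMSNorm, rotary-embedded attention over the KV cache, and a gated feed-forward block
-- (each with a residual add), finishing with a final RMSNorm and the output projection into logits.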
local function forward(state, weights)
weights.token_embedding_table:CopyTo(state.token * dim, state.x, 0, dim)
for i, layer in ipairs(weights.layers) do
local key_cache = state.key_cache[i]
local val_cache = state.val_cache[i]
state.xb:RmsNormInPlace(state.x, layer.rms_att_weight, dim, rms_norm_eps)
layer.wq:MatrixVectorMultiply(state.xb, state.q, dim, dim)
layer.wk:MatrixVectorMultiply(state.xb, state.k, kv_dim, dim)
layer.wv:MatrixVectorMultiply(state.xb, state.v, kv_dim, dim)
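-- Apply RoPE: rotate consecutive (even, odd) element pairs of q and k by the precomputed
-- cos/sin for this position; pairs at indices >= kv_dim exist only in q, so only q is rotated there.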
for i = 0, dim - 1, 2 do
local offset = 1 + (state.token_pos * (head_size / 2) + ((i % head_size) / 2))
local fcr = rope_freqs_real[offset]
local fci = rope_freqs_imag[offset]
local rotn = i < kv_dim and 2 or 1
for v = 0, rotn - 1 do
local vec = v == 0 and state.q or state.k
local v0 = vec:GetFloat(i)
local v1 = vec:GetFloat(i + 1)
vec:SetFloat(i, v0 * fcr - v1 * fci)
vec:SetFloat(i + 1, v0 * fci + v1 * fcr)
end
end
state.k:CopyTo(0, key_cache, state.token_pos * kv_dim, kv_dim)
state.v:CopyTo(0, val_cache, state.token_pos * kv_dim, kv_dim)
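-- Multi-head attention: each head scores its query slice against every cached key seen so far,
-- softmaxes the scores, and accumulates the value vectors weighted by attention into xb.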
for h = 0, number_of_heads - 1 do
local qOffset = h * head_size
local attOffset = h * context_length
local key_cache_offset = floor(h / kv_mul) * head_size
for t = 0, state.token_pos do
state.att:SetFloat(
attOffset + t,
state.q:Dot(qOffset, key_cache, t * kv_dim + key_cache_offset, head_size) / sqrt_head_size
)
end
state.att:SoftMaxInPlace(attOffset, state.token_pos + 1)
state.xb:FillInPlace(qOffset, head_size, 0)
for t = 0, state.token_pos do
state.xb:SaxpyInPlace(
qOffset,
val_cache,
t * kv_dim + key_cache_offset,
head_size,
state.att:GetFloat(attOffset + t)
)
end
end
layer.wo:MatrixVectorMultiply(state.xb, state.xb2, dim, dim)
state.x:AddTensorInPlace(state.xb2)
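-- Feed-forward block: gate (w1) and up (w3) projections, an in-place activation on the gate,
-- elementwise multiply, down projection (w2), then another residual add.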
state.xb:RmsNormInPlace(state.x, layer.rms_ffn_weight, dim, rms_norm_eps)
layer.w1:MatrixVectorMultiply(state.xb, state.hb, hidden_dim, dim)
layer.w3:MatrixVectorMultiply(state.xb, state.hb2, hidden_dim, dim)
state.hb:SigmoidInPlace()
state.hb:MultiplyTensorInPlace(state.hb2)
layer.w2:MatrixVectorMultiply(state.hb, state.xb, dim, hidden_dim)
state.x:AddTensorInPlace(state.xb)
end
state.x:RmsNormInPlace(state.x, weights.rms_final_weight, dim, rms_norm_eps)
weights.wcls:MatrixVectorMultiply(state.x, state.logits, vocabulary_size, dim)
end
local tokenizer = Tokenizer:new(metadata["tokenizer.ggml.tokens"], metadata["tokenizer.ggml.merges"])
local sampler = Sampler:new(vocabulary_size, temperature, topp)
local prompt_tokens = tokenizer:EncodeString(prompt)
local weights = {layers = {}}
do
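-- Load the weight tensors named by the GGUF file, wrapping each blob in a Tensor of the matching
-- type; tensor types without a compute kernel raise an error.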
local function load_tensor(name)
local entry = tensors[name]
if Tensor[entry.type_info.name] then
local tensor = Tensor[entry.type_info.name](Tensor, entry.size, entry.blob)
return tensor:SetName(name .. "[" .. entry.type_info.name .. "]")
end
error("NYI tensor type: " .. entry.type_info.name)
end
weights.token_embedding_table = load_tensor("token_embd.weight")
for i = 0, number_of_layers - 1 do
weights.layers[i + 1] = {
rms_att_weight = load_tensor("blk." .. i .. ".attn_norm.weight"),
wq = load_tensor("blk." .. i .. ".attn_q.weight"),
wk = load_tensor("blk." .. i .. ".attn_k.weight"),
wv = load_tensor("blk." .. i .. ".attn_v.weight"),
wo = load_tensor("blk." .. i .. ".attn_output.weight"),
rms_ffn_weight = load_tensor("blk." .. i .. ".ffn_norm.weight"),
w1 = load_tensor("blk." .. i .. ".ffn_gate.weight"),
w2 = load_tensor("blk." .. i .. ".ffn_down.weight"),
w3 = load_tensor("blk." .. i .. ".ffn_up.weight"),
}
end
weights.rms_final_weight = load_tensor("output_norm.weight")
weights.wcls = load_tensor("output.weight")
end
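-- Mutable inference state: current token and position, activation buffers, per-head attention
-- scores, output logits, and a per-layer key/value cache covering context_length positions.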
local state = {
token = tokenizer:EncodeString("<|begin_of_text|>")[1] - 1,
token_pos = 0,
x = Tensor:F32(dim),
xb = Tensor:F32(dim),
xb2 = Tensor:F32(dim),
hb = Tensor:F32(hidden_dim),
hb2 = Tensor:F32(hidden_dim),
q = Tensor:F32(dim),
k = Tensor:F32(dim),
v = Tensor:F32(dim),
att = Tensor:F32(number_of_heads * context_length),
logits = Tensor:F32(vocabulary_size),
key_cache = {},
val_cache = {},
}
for i = 1, number_of_layers do
state.key_cache[i] = Tensor:F32(context_length * kv_dim)
state.val_cache[i] = Tensor:F32(context_length * kv_dim)
end
if backend == "cuda" then
-- Upload weight tensors to the GPU once and preallocate shared buffers for the rest,
-- to reduce transfer overhead and VRAM usage.
local total_size = 0
local gpu = require("compute.gpu_cuda")
measure("uploading tensors to gpu")
local size_map = {}
for _, tensor in ipairs(Tensor.GetAll()) do
if tensor.name and tensor.name:find(".weight", 1, true) then -- plain find; a bare "." would match any character
-- weight tensors are static
tensor.blob.gpu_ptr = gpu.allocate_on_device(tensor.blob.byte_size, tensor.blob.blob)
total_size = total_size + tensor.blob.byte_size
else
-- state tensors are dynamic and re-uploaded on every Tensor.MatrixVectorMultiply call,
-- so a single device buffer can be allocated and shared per distinct byte size
size_map[tensor.blob.byte_size] = size_map[tensor.blob.byte_size] or {}
table.insert(size_map[tensor.blob.byte_size], tensor)
end
end
for byte_size, tensors in pairs(size_map) do
local gpu_ptr = gpu.allocate_on_device(byte_size)
for _, tensor in ipairs(tensors) do
tensor.blob.gpu_ptr = gpu_ptr
end
total_size = total_size + byte_size
end
measure()
print(string.format("%.2fgb tensors allocated on GPU", total_size / 1024 / 1024 / 1024))
gpu.dump_gpu_stats()
end
do
local total_size = 0
for _, tensor in ipairs(Tensor.GetAll()) do
total_size = total_size + tensor.blob.byte_size
end
print(string.format("%.2fgb tensors allocated on CPU", total_size / 1024 / 1024 / 1024))
end
profiler.Start()
local total_time = 0
print("\n\n\n")
math.randomseed(seed)
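-- Autoregressive decode loop: run one forward pass per position, then either force the next
-- prompt token or sample from the logits, and stop on <|eot_id|> once the prompt is consumed.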
while state.token_pos < max_tokens do
local start_time = get_time()
forward(state, weights)
local next_token
if state.token_pos < #prompt_tokens then
next_token = prompt_tokens[state.token_pos + 1]
else
next_token = sampler:SampleToken(state.logits) + 1
end
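-- Tokenizer indices are 1-based Lua table indices; the model uses 0-based token ids.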
state.token = next_token - 1
do
local token_string = tokenizer:TokenToString(state.token + 1)
token_callback(token_string)
if state.token_pos >= #prompt_tokens and token_string == "<|eot_id|>" then
break
end
end
state.token_pos = state.token_pos + 1
total_time = total_time + get_time() - start_time
end
print("\n\n\n")
if backend == "cuda" then require("compute.gpu_cuda").dump_gpu_stats() end
profiler.Stop()
local token_count = (state.token_pos + 1)
local tokens_per_sec = 1 / (total_time / token_count)
print(
string.format(
"token count: %i\nelapsed: %.2fs\n%.2f tokens/s",
token_count,
total_time,
tokens_per_sec
)
)
if backend == "cuda" and false then -- LOL
local gpu = require("compute.gpu_cuda")
local done = {}
for _, tensor in ipairs(Tensor.GetAll()) do
if not done[tensor.blob.gpu_ptr] then
gpu.free_on_device(tensor.blob.gpu_ptr)
done[tensor.blob.gpu_ptr] = true
end
end
end
end
end
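-- Wrap the user prompt in the Llama 3 instruct chat template and stream each generated token to stdout.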
load_and_run(model_path, "<|start_header_id|>user<|end_header_id|>\n" .. prompt .. "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", function(token)
io.write(token)
io.flush()
end)