Commit 35a3c99
Add support for full CUDA GPU offloading (#105)

Signed-off-by: mudler <[email protected]>
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
1 parent a796025 commit 35a3c99

8 files changed, +155 -24 lines changed

.github/workflows/test.yaml (+2 -12)

@@ -27,14 +27,9 @@ jobs:
       with:
         submodules: true

-    - name: Dependencies
-      run: |
-        brew update
-        brew install sdl2
-
     - name: Test
       run: |
-        make test
+        CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test

   macOS-metal-latest:
     runs-on: macOS-latest
@@ -45,12 +40,7 @@ jobs:
       with:
         submodules: true

-    - name: Dependencies
-      run: |
-        brew update
-        brew install sdl2
-
     - name: Test
       run: |
-        make BUILD_TYPE=metal test
+        CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make BUILD_TYPE=metal test
         CGO_LDFLAGS="-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" LIBRARY_PATH=$PWD C_INCLUDE_PATH=$PWD go build -o testbuild ./examples

1902.patch (new file, +120 -0)

From 200892a3a54323eb65ca9c8d8afb6043ca2d8944 Mon Sep 17 00:00:00 2001
From: mudler <[email protected]>
Date: Fri, 16 Jun 2023 23:43:36 +0200
Subject: [PATCH] Pass pointer to params in llama_init_from_file

Especially with golang bindings, calling by value has the side-effect of
values not being copied correctly. This has been observed with the
bindings in https://github.com/go-skynet/go-llama.cpp/pull/105.
---
 examples/common.cpp                                          | 2 +-
 examples/quantize-stats/quantize-stats.cpp                   | 2 +-
 examples/save-load-state/save-load-state.cpp                 | 4 ++--
 examples/train-text-from-scratch/train-text-from-scratch.cpp | 2 +-
 llama.cpp                                                    | 3 ++-
 llama.h                                                      | 2 +-
 tests/test-tokenizer-0.cpp                                   | 2 +-
 7 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 055383beff9..7cf48e82158 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -555,7 +555,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.logits_all = params.perplexity;
     lparams.embedding = params.embedding;

-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
+    llama_context * lctx = llama_init_from_file(params.model.c_str(), &lparams);

     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 6b8018ee284..a7c1e873a92 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -330,7 +330,7 @@ int main(int argc, char ** argv) {
     lparams.f16_kv = false;
     lparams.use_mlock = false;

-    ctx = llama_init_from_file(params.model.c_str(), lparams);
+    ctx = llama_init_from_file(params.model.c_str(), &lparams);

     if (ctx == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index da4d37ad03d..07ee6750d4c 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
     auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);

     // init
-    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto ctx = llama_init_from_file(params.model.c_str(), &lparams);
     auto tokens = std::vector<llama_token>(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);

@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
     llama_free(ctx);

     // load new model
-    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+    auto ctx2 = llama_init_from_file(params.model.c_str(), &lparams);

     // Load state (rng, logits, embedding and kv_cache) from file
     {
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 7ec85951adc..1c7a06c21be 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3054,7 +3054,7 @@ int main(int argc, char ** argv) {
     struct llama_context_params llama_params = llama_context_default_params();
     llama_params.vocab_only = true;

-    struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
+    struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, &llama_params);

     struct llama_vocab vocab;
     {
diff --git a/llama.cpp b/llama.cpp
index 81f047ed298..0629e873886 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2618,8 +2618,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

 struct llama_context * llama_init_from_file(
         const char * path_model,
-        struct llama_context_params params) {
+        const struct llama_context_params * params_ptr) {
     ggml_time_init();
+    struct llama_context_params params = *params_ptr;

     llama_context * ctx = new llama_context;

diff --git a/llama.h b/llama.h
index 1241ba6c0ec..faf2675f125 100644
--- a/llama.h
+++ b/llama.h
@@ -142,7 +142,7 @@ extern "C" {
     // Return NULL on failure
     LLAMA_API struct llama_context * llama_init_from_file(
             const char * path_model,
-            struct llama_context_params params);
+            const struct llama_context_params * params);

     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index ab1538a0cf3..b405df8e687 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -36,7 +36,7 @@ int main(int argc, char **argv) {

     lparams.vocab_only = true;

-    ctx = llama_init_from_file(fname.c_str(), lparams);
+    ctx = llama_init_from_file(fname.c_str(), &lparams);

     if (ctx == NULL) {
         fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
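
Context for the patch above: it changes the upstream llama_init_from_file API to take llama_context_params by pointer because, per the patch message, passing the struct by value was not copied correctly when called through the Go bindings. As a rough illustration of the pointer-passing pattern only (a minimal cgo sketch; the struct and function below are hypothetical stand-ins, not code from this commit or from llama.cpp):

package main

/*
#include <stdio.h>

// Hypothetical stand-in for llama_context_params; not the real struct.
typedef struct {
	int n_ctx;
	int n_gpu_layers;
} init_params;

// Taking a pointer avoids depending on how a by-value struct argument
// is marshalled across the cgo boundary.
static void init_backend(const init_params *p) {
	printf("n_ctx=%d, n_gpu_layers=%d\n", p->n_ctx, p->n_gpu_layers);
}
*/
import "C"

func main() {
	// Build the params on the Go side, then pass their address,
	// mirroring the llama_init_from_file(path, &lparams) calls in the patch.
	p := C.init_params{n_ctx: 512, n_gpu_layers: 35}
	C.init_backend(&p)
}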

Makefile (+10 -4)

@@ -176,7 +176,7 @@ $(info )
 # Use this if you want to set the default behavior


-llama.cpp/ggml.o:
+llama.cpp/ggml.o: prepare
	mkdir -p build
	cd build && cmake ../llama.cpp $(CMAKE_ARGS) && VERBOSE=1 cmake --build . --config Release && cp -rf CMakeFiles/ggml.dir/ggml.c.o ../llama.cpp/ggml.o

@@ -193,16 +193,22 @@ llama.cpp/k_quants.o: llama.cpp/ggml.o
	cd build && cp -rf CMakeFiles/ggml.dir/k_quants.c.o ../llama.cpp/k_quants.o

 llama.cpp/llama.o:
-	cd build && make llama.o && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o
+	cd build && cp -rf CMakeFiles/llama.dir/llama.cpp.o ../llama.cpp/llama.o

 llama.cpp/common.o:
-	cd build && make common && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o
+	cd build && cp -rf examples/CMakeFiles/common.dir/common.cpp.o ../llama.cpp/common.o

-binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
+binding.o: prepare llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)

+## https://github.com/ggerganov/llama.cpp/pull/1902
+prepare:
+	cd llama.cpp && patch -p1 < ../1902.patch
+	touch $@
+
 libbinding.a: binding.o llama.cpp/k_quants.o $(EXTRA_TARGETS)
	ar src libbinding.a llama.cpp/ggml.o llama.cpp/k_quants.o $(EXTRA_TARGETS) llama.cpp/common.o llama.cpp/llama.o binding.o
+
 clean:
	rm -rf *.o
	rm -rf *.a

binding.cpp (+8 -4)

@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+
 #include "binding.h"

 #include <cassert>
@@ -125,7 +126,7 @@ int llama_predict(void* params_ptr, void* state_pr, char* result, bool debug) {

     std::mt19937 rng(params.seed);

-    llama_init_backend();
+

     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
@@ -590,7 +591,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
 }


-void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit) {
+void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, bool vocab_only, int n_gpu_layers, int n_batch, const char *maingpu, const char *tensorsplit) {
     // load the model
     auto lparams = llama_context_default_params();

@@ -601,6 +602,8 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool
     lparams.use_mlock = mlock;
     lparams.n_gpu_layers = n_gpu_layers;
     lparams.use_mmap = mmap;
+    lparams.low_vram = low_vram;
+    lparams.vocab_only = vocab_only;

     if (maingpu[0] != '\0') {
         lparams.main_gpu = std::stoi(maingpu);
@@ -625,13 +628,14 @@ void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool

     lparams.n_batch = n_batch;

+    llama_init_backend();
     void* res = nullptr;
     try {
-        res = llama_init_from_file(fname, lparams);
+        res = llama_init_from_file(fname, &lparams);
     } catch(std::runtime_error& e) {
         fprintf(stderr, "failed %s",e.what());
         return res;
     }

     return res;
-}
+}

binding.h (+1 -1)

@@ -14,7 +14,7 @@ int eval(void* params_ptr, void *ctx, char*text);

 void save_state(void *ctx, char *dst, char*modes);

-void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, int n_gpu, int n_batch, const char *maingpu, const char *tensorsplit);
+void* load_model(const char *fname, int n_ctx, int n_seed, bool memory_f16, bool mlock, bool embeddings, bool mmap, bool low_vram, bool vocab_only, int n_gpu, int n_batch, const char *maingpu, const char *tensorsplit);

 int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);


llama.cpp (+1 -1, submodule reference updated; no inline diff shown)

llama.go (+1 -1)

@@ -23,7 +23,7 @@ type LLama struct {
 func New(model string, opts ...ModelOption) (*LLama, error) {
	mo := NewModelOptions(opts...)
	modelPath := C.CString(model)
-	result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit))
+	result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.bool(mo.MMap), C.bool(mo.LowVRAM), C.bool(mo.VocabOnly), C.int(mo.NGPULayers), C.int(mo.NBatch), C.CString(mo.MainGPU), C.CString(mo.TensorSplit))
	if result == nil {
		return nil, fmt.Errorf("failed loading model")
	}

options.go (+12 -1)

@@ -7,6 +7,8 @@ type ModelOptions struct {
	F16Memory  bool
	MLock      bool
	MMap       bool
+	VocabOnly  bool
+	LowVRAM    bool
	Embeddings bool
	NGPULayers int
	MainGPU    string
@@ -50,6 +52,7 @@ var DefaultModelOptions ModelOptions = ModelOptions{
	MLock:      false,
	Embeddings: false,
	MMap:       true,
+	LowVRAM:    false,
 }

 var DefaultOptions PredictOptions = PredictOptions{
@@ -58,7 +61,7 @@ var DefaultOptions PredictOptions = PredictOptions{
	Tokens:  128,
	Penalty: 1.1,
	Repeat:  64,
-	Batch:   8,
+	Batch:   512,
	NKeep:   64,
	TopK:    40,
	TopP:    0.95,
@@ -128,6 +131,14 @@ func SetPredictionMainGPU(maingpu string) PredictOption {
	}
 }

+var VocabOnly ModelOption = func(p *ModelOptions) {
+	p.VocabOnly = true
+}
+
+var EnabelLowVRAM ModelOption = func(p *ModelOptions) {
+	p.LowVRAM = true
+}
+
 var EnableEmbeddings ModelOption = func(p *ModelOptions) {
	p.Embeddings = true
 }
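
The Go-facing result of this commit is two new model options, VocabOnly and EnabelLowVRAM (spelled as defined in options.go above), which New forwards to load_model as vocab_only and low_vram. A minimal usage sketch, relying only on what this diff shows; the model path is a placeholder:

package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Placeholder path: point this at any local ggml model file.
	l, err := llama.New(
		"./models/ggml-model-q4_0.bin",
		llama.EnabelLowVRAM, // added in this commit; sets lparams.low_vram in load_model
	)
	if err != nil {
		fmt.Println("failed loading model:", err)
		return
	}
	_ = l

	// llama.VocabOnly, also added here, loads only the vocabulary,
	// which is useful for tokenization-only workloads.
}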
