From b473e95084c286780165568cf0f385f21141d68d Mon Sep 17 00:00:00 2001
From: Bryan Honof
Date: Mon, 17 Jun 2024 17:37:55 +0200
Subject: [PATCH 01/11] Add Nix and Flox install instructions (#7899)

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index fd75a64ba1a0c..54859946da565 100644
--- a/README.md
+++ b/README.md
@@ -387,6 +387,30 @@ brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
 
+### Nix
+
+On Mac and Linux, the Nix package manager can be used via
+```
+nix profile install nixpkgs#llama-cpp
+```
+For flake-enabled installs.
+
+Or
+```
+nix-env --file '<nixpkgs>' --install --attr llama-cpp
+```
+For non-flake-enabled installs.
+
+This expression is automatically updated within the [nixpkgs repo](https://github.com/NixOS/nixpkgs/blob/nixos-24.05/pkgs/by-name/ll/llama-cpp/package.nix#L164).
+
+#### Flox
+
+On Mac and Linux, Flox can be used to install llama.cpp within a Flox environment via
+```
+flox install llama-cpp
+```
+Flox follows the nixpkgs build of llama.cpp.
+
 ### Metal Build
 
 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.

From 7c26775adb579e92b59c82e8084c07a1d0f75e9c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 17 Jun 2024 19:40:01 +0300
Subject: [PATCH 02/11] llama : disable FA if KV head sizes do not match
 (#7982)

---
 llama.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index dd7020dc0eeab..6194875155851 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -16260,6 +16260,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
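Note on PATCH 02: the new guard is observable from the public API. Below is a minimal, illustrative sketch — it assumes the `llama.h` C API of this period, and "model.gguf" is a placeholder path, not a file shipped with the repo:

```cpp
// Illustrative only: request flash attention and let the new guard
// decide. If the model has n_embd_head_k != n_embd_head_v, the check
// above logs a warning and clears the flag instead of failing.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // request FA; it may be forced off

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        return 1;
    }

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```

The design choice is graceful degradation: an unsupported option is logged and dropped rather than turned into a hard failure at context creation.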
From 5b6da187508f49a9fa9d95fa22ae804a0780d256 Mon Sep 17 00:00:00 2001
From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Date: Mon, 17 Jun 2024 23:53:17 +0530
Subject: [PATCH 03/11] Make updates to type cast based on compiler instead of
 OS (#7851)

---
 ggml-impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-impl.h b/ggml-impl.h
index 5e77471f332f4..1d23361906c34 100644
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -17,7 +17,7 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-#if defined(_WIN32)
+#if defined(_MSC_VER)
 
 #define m512bh(p) p
 #define m512i(p) p
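Why PATCH 03 keys on the compiler rather than the OS: MinGW GCC on Windows defines `_WIN32` but accepts GNU-style vector casts just like GCC on Linux, so the `_WIN32` check routed it to the wrong branch. A standalone sketch of the idiom — it mirrors, but is not copied verbatim from, `ggml-impl.h`:

```cpp
// Sketch: key intrinsic-cast macros on the compiler, not the OS.
#include <immintrin.h>

#if defined(_MSC_VER)
    #define m512i(p) p               // MSVC: pass the value through unchanged
#else
    #define m512i(p) (__m512i)(p)    // GCC/clang on any OS: explicit vector cast
#endif
```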
"{arch}.tensor_data_layout" + EXPERT_COUNT = "{arch}.expert_count" + EXPERT_USED_COUNT = "{arch}.expert_used_count" + EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" + EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" + POOLING_TYPE = "{arch}.pooling_type" + LOGIT_SCALE = "{arch}.logit_scale" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ed56abfb3c2ea..a697f657b9ac8 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -394,6 +394,9 @@ def add_feed_forward_length(self, length: int) -> None: def add_expert_feed_forward_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length) + def add_expert_shared_feed_forward_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.EXPERT_SHARED_FEED_FORWARD_LENGTH.format(arch=self.arch), length) + def add_parallel_residual(self, use: bool) -> None: self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) diff --git a/llama.cpp b/llama.cpp index 6194875155851..e06c851ad5b6c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -286,6 +286,7 @@ enum llm_kv { LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_FEED_FORWARD_LENGTH, LLM_KV_EXPERT_FEED_FORWARD_LENGTH, + LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, @@ -364,21 +365,22 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" }, { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" }, - { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, - { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, - { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, - { LLM_KV_BLOCK_COUNT, "%s.block_count" }, - { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, - { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, - { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" }, - { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, - { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, - { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, - { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, - { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" }, - { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, - { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, - { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, + { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, + { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, + { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, + { LLM_KV_BLOCK_COUNT, "%s.block_count" }, + { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, + { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, + { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" }, + { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" }, + { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, + { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, + { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, + { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" }, + { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" }, + { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, + { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, + { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -1970,6 +1972,7 @@ struct llama_hparams { uint32_t n_lora_q = 
From e6ecc2be470e3c5c6c5c9d8b90aa83a1f7725084 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 18 Jun 2024 09:37:20 +0300
Subject: [PATCH 05/11] whisper : use ggml_backend_sched (whisper/2239)

* whisper : use ggml_backend_sched (wip)

* use sched in whisper_allocr

* whisper : single backend in whisper_context

* whisper : remove whisper_state->backends_used

* whisper : remove whisper_context->backend

* whisper : reset scheduler after init

* whisper : fix external encoder (e.g. CoreML)

* whisper : cleanup

* whisper : handle null GPU buffer types + fix sycl

---------

Co-authored-by: slaren

---
 ggml-backend.c | 15 +++++++++++++--
 ggml-backend.h |  3 +++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 26dce7f724213..13c71c310c446 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1706,14 +1706,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     bool backend_ids_changed = false;
     for (int i = 0; i < sched->graph->n_nodes; i++) {
-        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+        if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
+            sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
             backend_ids_changed = true;
             break;
         }
     }
     if (!backend_ids_changed) {
         for (int i = 0; i < sched->graph->n_leafs; i++) {
-            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+            if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
+                sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                 backend_ids_changed = true;
                 break;
             }
@@ -1977,6 +1979,15 @@ int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
     return sched->n_copies;
 }
 
+int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched) {
+    return sched->n_backends;
+}
+
+ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i) {
+    GGML_ASSERT(i >= 0 && i < sched->n_backends);
+    return sched->backends[i];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);

diff --git a/ggml-backend.h b/ggml-backend.h
index 47fd814751795..4a38eeb5c23bd 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -182,6 +182,9 @@ extern "C" {
     // Initialize backend buffers from a measure graph
     GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
 
+    GGML_API int            ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
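The two accessors PATCH 05 adds to `ggml-backend.h` let callers such as whisper.cpp enumerate the backends a scheduler was built with. A hedged usage sketch — it assumes `sched` was created elsewhere with `ggml_backend_sched_new` and omits error handling:

```cpp
// Sketch: list a scheduler's backends with the new accessors.
#include "ggml-backend.h"
#include <cstdio>

void print_sched_backends(ggml_backend_sched_t sched) {
    const int n = ggml_backend_sched_get_n_backends(sched);
    for (int i = 0; i < n; i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);
        printf("backend %d: %s (buffer size: %zu bytes)\n",
               i, ggml_backend_name(backend),
               ggml_backend_sched_get_buffer_size(sched, backend));
    }
}
```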
From 5326bcceeb7dd34f16d0fe61b134d1e074a8e65d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 18 Jun 2024 09:50:45 +0300
Subject: [PATCH 06/11] ggml : sync

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 5042f82ae477f..b6c57ec5e4f14 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-2aae01fd9b8f9399f343cf18f46f38996ef52e2c
+5653a195935ea3ac54652644c9daf154dbc1571b

From 1193778105c9a81bd38f72c61aaafbaf85dc9c04 Mon Sep 17 00:00:00 2001
From: Abheek Gulati
Date: Mon, 17 Jun 2024 23:57:41 -0700
Subject: [PATCH 07/11] readme : update UI list (#7943)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 54859946da565..40793c8eab880 100644
--- a/README.md
+++ b/README.md
@@ -209,6 +209,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
+- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

From b96f9afb0d58b003ac8d1d0c94cd99393a3bc437 Mon Sep 17 00:00:00 2001
From: Frank Mai
Date: Tue, 18 Jun 2024 15:11:40 +0800
Subject: [PATCH 08/11] chore: clean useless beam search param (#7985)

Signed-off-by: thxCode

---
 common/common.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/common.h b/common/common.h
index 58ed72f433bdf..9a1dc4a2fe4c1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -73,7 +73,6 @@ struct gpt_params {
     int32_t n_gpu_layers_draft = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
-    int32_t n_beams            = 0;    // if non-zero then use beam search of given width.
     int32_t grp_attn_n         = 1;    // group-attention factor
     int32_t grp_attn_w         = 512;  // group-attention width
     int32_t n_print            = -1;   // print token count every n tokens (-1 = disabled)

From 61665277afde2add00c0d387acb94ed5feb95917 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Tue, 18 Jun 2024 14:00:14 +0200
Subject: [PATCH 09/11] Allow compiling with CUDA without CUDA runtime
 installed (#7989)

On hosts which are not prepared/dedicated to execute code using CUDA
it is still possible to compile llama.cpp with CUDA support by just
installing the development packages.  Missing are the runtime
libraries like /usr/lib64/libcuda.so* and currently the link step
will fail.

The development environment is prepared for such situations.  There
are stub libraries for all the CUDA libraries available in the
$(CUDA_PATH)/lib64/stubs directory.  Adding this directory to the end
of the search path will not change anything for environments which
currently work fine but will enable compiling llama.cpp also in case
the runtime code is not available.

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index f94ee393377b7..dddf647cd551d 100644
--- a/Makefile
+++ b/Makefile
@@ -507,7 +507,7 @@ ifdef LLAMA_CUDA
 		CUDA_PATH ?= /usr/local/cuda
 	endif
 	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	OBJS         += $(OBJS_CUDA_TEMP_INST)
From 84f6de17f6a8602e7ff7f7c7bda36a73f510a2dd Mon Sep 17 00:00:00 2001
From: jojorne
Date: Tue, 18 Jun 2024 09:18:32 -0300
Subject: [PATCH 10/11] Fix no gcc pragma on Windows (#7751)

---
 sgemm.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sgemm.cpp b/sgemm.cpp
index 40ba9d7e9a7b7..bbe263ddd2bb4 100644
--- a/sgemm.cpp
+++ b/sgemm.cpp
@@ -43,8 +43,10 @@
 // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
 //     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
 
+#if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Wpedantic"
 #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
 
 #include "sgemm.h"
 #include "ggml-impl.h"
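PATCH 10 guards GCC-specific pragmas so MSVC never parses them. A standalone sketch of the broader portable pattern — the MSVC branch and warning number 4068 ("unknown pragma") are illustrative additions, not part of the patch:

```cpp
// Sketch: per-compiler diagnostic suppression.
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#elif defined(_MSC_VER)
#pragma warning(disable : 4068)   // C4068: unknown pragma
#endif

#include <cstdio>

int main() {
    printf("builds cleanly under GCC, clang, and MSVC\n");
    return 0;
}
```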
From 91c188d6c296bd3384f2a02a83b71187aa3d18b3 Mon Sep 17 00:00:00 2001
From: Sigbjørn Skjæret
Date: Tue, 18 Jun 2024 14:19:45 +0200
Subject: [PATCH 11/11] Only use FIM middle token if it exists (#7648)

* Only use FIM middle if it exists

* Only use FIM middle if it exists

---
 examples/infill/infill.cpp | 13 +++++++++++--
 examples/server/server.cpp |  7 ++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 0e4ec79c693fa..3e82e4a81a20b 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -223,7 +223,11 @@ int main(int argc, char ** argv) {
         inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
         embd_inp = inp_pfx;
         embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-        embd_inp.push_back(llama_token_middle(model));
+
+        const llama_token middle_token = llama_token_middle(model);
+        if (middle_token >= 0) {
+            embd_inp.push_back(middle_token);
+        }
 
         LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
         LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@@ -528,7 +532,12 @@ int main(int argc, char ** argv) {
                 inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
                 embd_inp = inp_pfx;
                 embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                embd_inp.push_back(llama_token_middle(model));
+
+                const llama_token middle_token = llama_token_middle(model);
+                if (middle_token >= 0) {
+                    embd_inp.push_back(middle_token);
+                }
+
                 embd.clear();
                 n_remain = params.n_predict;
                 n_past = 0;

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 919078f2bd920..ec59307b2881b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2038,7 +2038,12 @@ struct server_context {
                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
                     prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
                     prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                    prefix_tokens.push_back(llama_token_middle(model));
+
+                    const llama_token middle_token = llama_token_middle(model);
+                    if (middle_token >= 0) {
+                        prefix_tokens.push_back(middle_token);
+                    }
+
                     prompt_tokens = prefix_tokens;
                 } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt