From 165e5c08ac7cb9b5ba5d04fd374bf08019a6e01a Mon Sep 17 00:00:00 2001
From: Hans
Date: Mon, 7 Oct 2024 17:45:55 +0800
Subject: [PATCH] feat: bump `llama.cpp` to `b3889`

---
 CMakeLists.txt                |  9 ---------
 patches/llama.patch           | 22 ----------------------
 src/LlamaCompletionWorker.cpp | 12 ++++++------
 src/LlamaContext.cpp          | 16 +++++++---------
 src/common.hpp                |  3 ++-
 src/llama.cpp                 |  2 +-
 6 files changed, 16 insertions(+), 48 deletions(-)
 delete mode 100644 patches/llama.patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3b7aa6..347dd90 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,15 +62,6 @@ if (VULKAN_SDK)
   find_package(Vulkan REQUIRED)
 endif()
 
-find_program(PATCH patch REQUIRED)
-
-add_custom_target(
-  patch ALL
-  COMMAND ${PATCH} -p1 -N < ${CMAKE_SOURCE_DIR}/patches/llama.patch || true
-  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp
-  COMMENT "Applying patches"
-)
-
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
 
 add_subdirectory("src/llama.cpp")
diff --git a/patches/llama.patch b/patches/llama.patch
deleted file mode 100644
index f141092..0000000
--- a/patches/llama.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
-index fa68360b..f9ff7b5d 100644
---- a/ggml/src/ggml-vulkan.cpp
-+++ b/ggml/src/ggml-vulkan.cpp
-@@ -617,9 +617,15 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
-         vk::PipelineCreateFlags(),
-         pipeline_shader_create_info,
-         pipeline->layout);
--    pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
- 
--    device->pipelines.push_back(pipeline);
-+    try {
-+        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
-+        device->pipelines.push_back(pipeline);
-+    } catch(vk::UnknownError const&) {
-+        VK_LOG_DEBUG("Failed to create pipeline " << name);
-+        ggml_vk_destroy_pipeline(device->device, pipeline);
-+        pipeline.reset();
-+    }
- }
- 
- static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
diff --git a/src/LlamaCompletionWorker.cpp b/src/LlamaCompletionWorker.cpp
index 9455d33..d23f2fa 100644
--- a/src/LlamaCompletionWorker.cpp
+++ b/src/LlamaCompletionWorker.cpp
@@ -59,13 +59,13 @@ void LlamaCompletionWorker::Execute() {
   size_t n_cur = 0;
   size_t n_input = 0;
   const auto model = _sess->model();
-  const bool add_bos = llama_should_add_bos_token(model);
+  const bool add_bos = llama_add_bos_token(model);
   auto ctx = _sess->context();
 
-  llama_set_rng_seed(ctx, _params.seed);
+  auto sparams = llama_sampler_chain_default_params();
 
-  LlamaCppSampling sampling{llama_sampling_init(_params.sparams),
-                            llama_sampling_free};
+  LlamaCppSampling sampling{gpt_sampler_init(model, _params.sparams),
+                            gpt_sampler_free};
 
   std::vector<llama_token> prompt_tokens =
       ::llama_tokenize(ctx, _params.prompt, add_bos);
@@ -109,8 +109,8 @@ void LlamaCompletionWorker::Execute() {
     }
     // sample the next token
     const llama_token new_token_id =
-        llama_sampling_sample(sampling.get(), ctx, nullptr);
-    llama_sampling_accept(sampling.get(), ctx, new_token_id, true);
+        gpt_sampler_sample(sampling.get(), ctx, -1);
+    gpt_sampler_accept(sampling.get(), new_token_id, true);
     // prepare the next batch
     embd->emplace_back(new_token_id);
     auto token = llama_token_to_piece(ctx, new_token_id);
diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp
index ec3fd33..c4a5b9f 100644
--- a/src/LlamaContext.cpp
+++ b/src/LlamaContext.cpp
@@ -75,7 +75,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.embedding = get_option(options, "embedding", false);
   params.n_ctx = get_option(options, "n_ctx", 512);
   params.n_batch = get_option(options, "n_batch", 2048);
-  params.n_threads =
+  params.cpuparams.n_threads =
       get_option(options, "n_threads", cpu_get_num_math() / 2);
   params.n_gpu_layers = get_option(options, "n_gpu_layers", -1);
   params.use_mlock = get_option(options, "use_mlock", false);
@@ -86,16 +86,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   llama_backend_init();
   llama_numa_init(params.numa);
 
-  llama_model *model;
-  llama_context *ctx;
-  std::tie(model, ctx) = llama_init_from_gpt_params(params);
+  auto result = llama_init_from_gpt_params(params);
 
-  if (model == nullptr || ctx == nullptr) {
+  if (result.model == nullptr || result.context == nullptr) {
     Napi::TypeError::New(env, "Failed to load model")
         .ThrowAsJavaScriptException();
   }
 
-  _sess = std::make_shared<LlamaSession>(model, ctx, params);
+  _sess = std::make_shared<LlamaSession>(result.model, result.context, params);
 
   _info = gpt_params_get_system_info(params);
 }
@@ -167,11 +165,11 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   params.sparams.penalty_present =
      get_option(options, "penalty_present", 0.00f);
   params.sparams.penalize_nl = get_option(options, "penalize_nl", false);
-  params.sparams.typical_p = get_option(options, "typical_p", 1.00f);
-  params.ignore_eos = get_option(options, "ignore_eos", false);
+  params.sparams.typ_p = get_option(options, "typical_p", 1.00f);
+  params.sparams.ignore_eos = get_option(options, "ignore_eos", false);
   params.sparams.grammar = get_option(options, "grammar", "");
   params.n_keep = get_option(options, "n_keep", 0);
-  params.seed = get_option(options, "seed", LLAMA_DEFAULT_SEED);
+  params.sparams.seed = get_option(options, "seed", LLAMA_DEFAULT_SEED);
   std::vector<std::string> stop_words;
   if (options.Has("stop") && options.Get("stop").IsArray()) {
     auto stop_words_array = options.Get("stop").As<Napi::Array>();
diff --git a/src/common.hpp b/src/common.hpp
index 06e2f5b..68a878f 100644
--- a/src/common.hpp
+++ b/src/common.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "common/common.h"
+#include "common/sampling.h"
 #include "llama.h"
 #include
 #include
@@ -12,7 +13,7 @@
 typedef std::unique_ptr<llama_model, decltype(&llama_free_model)>
     LlamaCppModel;
 typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
-typedef std::unique_ptr<llama_sampling_context, decltype(&llama_sampling_free)>
+typedef std::unique_ptr<gpt_sampler, decltype(&gpt_sampler_free)>
     LlamaCppSampling;
 typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;
diff --git a/src/llama.cpp b/src/llama.cpp
index 4730fac..b6d6c52 160000
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1 +1 @@
-Subproject commit 4730faca618ff9cee0780580145e3cbe86f24876
+Subproject commit b6d6c5289f1c9c677657c380591201ddb210b649
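
For reference, a minimal standalone sketch of the `gpt_sampler` flow this patch migrates to. It only strings together calls that already appear in the diff above (`gpt_sampler_init`, `gpt_sampler_sample`, `gpt_sampler_accept`, `gpt_sampler_free`) against the llama.cpp `b3889` headers; the `sample_next_token` helper name is illustrative and not part of this tree, and in the real worker the sampler persists for the whole completion rather than per token.

// Sketch only: assumes a loaded model/context and a populated gpt_params.
#include "common/common.h"
#include "common/sampling.h"
#include "llama.h"

static llama_token sample_next_token(llama_model *model, llama_context *ctx,
                                     const gpt_params &params) {
  gpt_sampler *smpl = gpt_sampler_init(model, params.sparams);
  // idx = -1 samples from the logits of the last decoded token.
  const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
  // Unlike the old llama_sampling_accept, the context is no longer passed.
  gpt_sampler_accept(smpl, id, /* accept_grammar = */ true);
  gpt_sampler_free(smpl);
  return id;
}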