From f650efc051bb16c9026bd7eec2aab3c82f5c1341 Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Mon, 18 Nov 2024 17:57:45 +0800 Subject: [PATCH] feat(cpp): sync llama cpp (#53) * feat(cpp): sync llama.cpp * fix(cpp): migrate apis * fix(cmake): set LLAMA_BUILD_COMMON --- CMakeLists.txt | 2 ++ src/DetokenizeWorker.cpp | 2 +- src/EmbeddingWorker.cpp | 4 ++-- src/LlamaCompletionWorker.cpp | 16 ++++++++-------- src/LlamaCompletionWorker.h | 4 ++-- src/LlamaContext.cpp | 17 ++++++++--------- src/TokenizeWorker.cpp | 2 +- src/common.hpp | 8 ++++---- src/llama.cpp | 2 +- 9 files changed, 29 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 347dd90..3cda20d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,8 @@ if (VULKAN_SDK) find_package(Vulkan REQUIRED) endif() +set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common") + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries") add_subdirectory("src/llama.cpp") diff --git a/src/DetokenizeWorker.cpp b/src/DetokenizeWorker.cpp index 8b7d9ac..4703910 100644 --- a/src/DetokenizeWorker.cpp +++ b/src/DetokenizeWorker.cpp @@ -8,7 +8,7 @@ DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info, _tokens(std::move(tokens)) {} void DetokenizeWorker::Execute() { - const auto text = ::llama_detokenize(_sess->context(), _tokens); + const auto text = ::common_detokenize(_sess->context(), _tokens); _text = std::move(text); } diff --git a/src/EmbeddingWorker.cpp b/src/EmbeddingWorker.cpp index 9e268e9..a76368a 100644 --- a/src/EmbeddingWorker.cpp +++ b/src/EmbeddingWorker.cpp @@ -7,7 +7,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info, void EmbeddingWorker::Execute() { llama_kv_cache_clear(_sess->context()); - auto tokens = ::llama_tokenize(_sess->context(), _text, true); + auto tokens = ::common_tokenize(_sess->context(), _text, true); // add SEP if not present if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) { 
tokens.push_back(llama_token_sep(_sess->model())); @@ -16,7 +16,7 @@ void EmbeddingWorker::Execute() { do { int ret = llama_decode(_sess->context(), - llama_batch_get_one(tokens.data(), tokens.size(), 0, 0)); + llama_batch_get_one(tokens.data(), tokens.size())); if (ret < 0) { SetError("Failed to inference, code: " + std::to_string(ret)); break; diff --git a/src/LlamaCompletionWorker.cpp b/src/LlamaCompletionWorker.cpp index d23f2fa..ad8ced3 100644 --- a/src/LlamaCompletionWorker.cpp +++ b/src/LlamaCompletionWorker.cpp @@ -34,7 +34,7 @@ size_t findStoppingStrings(const std::string &text, LlamaCompletionWorker::LlamaCompletionWorker( const Napi::CallbackInfo &info, LlamaSessionPtr &sess, - Napi::Function callback, gpt_params params, + Napi::Function callback, common_params params, std::vector stop_words) : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _params(params), _stop_words(stop_words) { @@ -64,11 +64,11 @@ void LlamaCompletionWorker::Execute() { auto sparams = llama_sampler_chain_default_params(); - LlamaCppSampling sampling{gpt_sampler_init(model, _params.sparams), - gpt_sampler_free}; + LlamaCppSampling sampling{common_sampler_init(model, _params.sparams), + common_sampler_free}; std::vector prompt_tokens = - ::llama_tokenize(ctx, _params.prompt, add_bos); + ::common_tokenize(ctx, _params.prompt, add_bos); n_input = prompt_tokens.size(); if (_sess->tokens_ptr()->size() > 0) { n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens); @@ -102,18 +102,18 @@ void LlamaCompletionWorker::Execute() { _result.truncated = true; } int ret = llama_decode( - ctx, llama_batch_get_one(embd->data() + n_cur, n_input, n_cur, 0)); + ctx, llama_batch_get_one(embd->data() + n_cur, n_input)); if (ret < 0) { SetError("Failed to decode token, code: " + std::to_string(ret)); break; } // sample the next token const llama_token new_token_id = - gpt_sampler_sample(sampling.get(), ctx, -1); - gpt_sampler_accept(sampling.get(), new_token_id, true); + 
common_sampler_sample(sampling.get(), ctx, -1); + common_sampler_accept(sampling.get(), new_token_id, true); // prepare the next batch embd->emplace_back(new_token_id); - auto token = llama_token_to_piece(ctx, new_token_id); + auto token = common_token_to_piece(ctx, new_token_id); _result.text += token; n_cur += n_input; _result.tokens_evaluated += n_input; diff --git a/src/LlamaCompletionWorker.h b/src/LlamaCompletionWorker.h index 8a55842..8a2a578 100644 --- a/src/LlamaCompletionWorker.h +++ b/src/LlamaCompletionWorker.h @@ -12,7 +12,7 @@ class LlamaCompletionWorker : public Napi::AsyncWorker, public Napi::Promise::Deferred { public: LlamaCompletionWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess, - Napi::Function callback, gpt_params params, + Napi::Function callback, common_params params, std::vector stop_words = {}); ~LlamaCompletionWorker(); @@ -28,7 +28,7 @@ class LlamaCompletionWorker : public Napi::AsyncWorker, private: LlamaSessionPtr _sess; - gpt_params _params; + common_params _params; std::vector _stop_words; Napi::ThreadSafeFunction _tsfn; bool _has_callback = false; diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp index c4a5b9f..9c94384 100644 --- a/src/LlamaContext.cpp +++ b/src/LlamaContext.cpp @@ -7,8 +7,8 @@ #include "SaveSessionWorker.h" #include "TokenizeWorker.h" -std::vector get_messages(Napi::Array messages) { - std::vector chat; +std::vector get_messages(Napi::Array messages) { + std::vector chat; for (size_t i = 0; i < messages.Length(); i++) { auto message = messages.Get(i).As(); chat.push_back({ @@ -67,7 +67,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info) } auto options = info[0].As(); - gpt_params params; + common_params params; params.model = get_option(options, "model", ""); if (params.model.empty()) { Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException(); @@ -86,7 +86,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info) llama_backend_init(); 
llama_numa_init(params.numa); - auto result = llama_init_from_gpt_params(params); + auto result = common_init_from_params(params); if (result.model == nullptr || result.context == nullptr) { Napi::TypeError::New(env, "Failed to load model") @@ -94,7 +94,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info) } _sess = std::make_shared(result.model, result.context, params); - _info = gpt_params_get_system_info(params); + _info = common_params_get_system_info(params); } // getSystemInfo(): string @@ -109,7 +109,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) { Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException(); } auto messages = info[0].As(); - auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true); + auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true); return Napi::String::New(env, formatted); } @@ -133,10 +133,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) { } auto options = info[0].As(); - gpt_params params = _sess->params(); + common_params params = _sess->params(); if (options.Has("messages") && options.Get("messages").IsArray()) { auto messages = options.Get("messages").As(); - auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true); + auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true); params.prompt = formatted; } else { params.prompt = get_option(options, "prompt", ""); @@ -150,7 +150,6 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) { params.sparams.top_k = get_option(options, "top_k", 40); params.sparams.top_p = get_option(options, "top_p", 0.95f); params.sparams.min_p = get_option(options, "min_p", 0.05f); - params.sparams.tfs_z = get_option(options, "tfs_z", 1.00f); params.sparams.mirostat = get_option(options, "mirostat", 0.00f); params.sparams.mirostat_tau = get_option(options, 
"mirostat_tau", 5.00f); diff --git a/src/TokenizeWorker.cpp b/src/TokenizeWorker.cpp index 5a50542..383de40 100644 --- a/src/TokenizeWorker.cpp +++ b/src/TokenizeWorker.cpp @@ -6,7 +6,7 @@ TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info, : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {} void TokenizeWorker::Execute() { - const auto tokens = ::llama_tokenize(_sess->context(), _text, false); + const auto tokens = ::common_tokenize(_sess->context(), _text, false); _result.tokens = std::move(tokens); } diff --git a/src/common.hpp b/src/common.hpp index 68a878f..ff2230d 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -13,7 +13,7 @@ typedef std::unique_ptr LlamaCppModel; typedef std::unique_ptr LlamaCppContext; -typedef std::unique_ptr +typedef std::unique_ptr LlamaCppSampling; typedef std::unique_ptr LlamaCppBatch; @@ -47,7 +47,7 @@ constexpr T get_option(const Napi::Object &options, const std::string &name, class LlamaSession { public: - LlamaSession(llama_model *model, llama_context *ctx, gpt_params params) + LlamaSession(llama_model *model, llama_context *ctx, common_params params) : model_(LlamaCppModel(model, llama_free_model)), ctx_(LlamaCppContext(ctx, llama_free)), params_(params) { tokens_.reserve(params.n_ctx); @@ -65,7 +65,7 @@ class LlamaSession { tokens_ = std::move(tokens); } - inline const gpt_params &params() const { return params_; } + inline const common_params &params() const { return params_; } inline std::mutex &get_mutex() { return mutex; } @@ -79,7 +79,7 @@ class LlamaSession { private: LlamaCppModel model_; LlamaCppContext ctx_; - const gpt_params params_; + const common_params params_; std::vector tokens_{}; std::mutex mutex; }; diff --git a/src/llama.cpp b/src/llama.cpp index b6d6c52..75207b3 160000 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1 +1 @@ -Subproject commit b6d6c5289f1c9c677657c380591201ddb210b649 +Subproject commit 75207b3a887f91f813de1eb6e9fd135d3cb2b8c6