diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c865bf..f84b480 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ project (llama-node)
 set(CMAKE_CXX_STANDARD 17)
 
 execute_process(COMMAND
-  git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/ggml-cpu-CMakeLists.txt.patch
+  git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/llama.cpp.patch
   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
 )
 
diff --git a/lib/binding.ts b/lib/binding.ts
index b8ef394..00e4927 100644
--- a/lib/binding.ts
+++ b/lib/binding.ts
@@ -16,6 +16,7 @@ export type LlamaModelOptions = {
   n_gpu_layers?: number
   use_mlock?: boolean
   use_mmap?: boolean
+  vocab_only?: boolean
 }
 
 export type LlamaCompletionOptions = {
diff --git a/scripts/ggml-cpu-CMakeLists.txt.patch b/scripts/ggml-cpu-CMakeLists.txt.patch
deleted file mode 100644
index dd00275..0000000
--- a/scripts/ggml-cpu-CMakeLists.txt.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index 683b90af..e1bf104c 100644
---- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-+++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-@@ -80,7 +80,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
-         message(STATUS "ARM detected")
- 
-         if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
--            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
-+            list(APPEND ARCH_FLAGS /arch:armv8.7)
-         else()
-             check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-             if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
diff --git a/scripts/llama.cpp.patch b/scripts/llama.cpp.patch
new file mode 100644
index 0000000..0667a20
--- /dev/null
+++ b/scripts/llama.cpp.patch
@@ -0,0 +1,37 @@
+diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
+index 683b90af..e1bf104c 100644
+--- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
++++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
+@@ -80,7 +80,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
+         message(STATUS "ARM detected")
+ 
+         if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
+-            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
++            list(APPEND ARCH_FLAGS /arch:armv8.7)
+         else()
+             check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+             if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
+index 1d2bd932..b5007c66 100644
+--- a/src/llama.cpp/common/common.h
++++ b/src/llama.cpp/common/common.h
+@@ -183,6 +183,7 @@ struct common_params_vocoder {
+ };
+ 
+ struct common_params {
++    bool vocab_only = false;
+     int32_t n_predict = -1; // new tokens to predict
+     int32_t n_ctx = 4096; // context size
+     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
+index 20be9291..1bedc55d 100644
+--- a/src/llama.cpp/common/common.cpp
++++ b/src/llama.cpp/common/common.cpp
+@@ -1017,6 +1017,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+     if (params.n_gpu_layers != -1) {
+         mparams.n_gpu_layers = params.n_gpu_layers;
+     }
++    mparams.vocab_only = params.vocab_only;
+     mparams.rpc_servers = params.rpc_servers.c_str();
+     mparams.main_gpu = params.main_gpu;
+     mparams.split_mode = params.split_mode;
diff --git a/src/LlamaContext.cpp b/src/LlamaContext.cpp
index 2474a69..96b2e8d 100644
--- a/src/LlamaContext.cpp
+++ b/src/LlamaContext.cpp
@@ -76,6 +76,11 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
     Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
   }
 
+  params.vocab_only = get_option(options, "vocab_only", false);
+  if (params.vocab_only) {
+    params.warmup = false;
+  }
+
   params.n_ctx = get_option(options, "n_ctx", 512);
   params.n_batch = get_option(options, "n_batch", 2048);
   params.embedding = get_option(options, "embedding", false);
diff --git a/test/__snapshots__/index.test.ts.snap b/test/__snapshots__/index.test.ts.snap
index e97995e..e9329ff 100644
--- a/test/__snapshots__/index.test.ts.snap
+++ b/test/__snapshots__/index.test.ts.snap
@@ -413,7 +413,7 @@ exports[`tokeneize 3`] = `
 "
 `;
 
-exports[`work fine 1`] = `
+exports[`works fine 1`] = `
 {
   "text": " swochadoorter scientific WindowsCa occupiedrå alta",
   "timings": "Timings: (8) keys",
@@ -423,7 +423,63 @@ exports[`work fine 1`] = `
 }
 `;
 
-exports[`work fine: model info 1`] = `
+exports[`works fine with vocab_only: empty result 1`] = `
+{
+  "text": "",
+  "timings": {
+    "predicted_ms": 0,
+    "predicted_n": 1,
+    "predicted_per_second": Infinity,
+    "predicted_per_token_ms": 0,
+    "prompt_ms": 0,
+    "prompt_n": 1,
+    "prompt_per_second": Infinity,
+    "prompt_per_token_ms": 0,
+  },
+  "tokens_evaluated": 0,
+  "tokens_predicted": 0,
+  "truncated": false,
+}
+`;
+
+exports[`works fine with vocab_only: model info 1`] = `
+{
+  "desc": "llama ?B all F32",
+  "isChatTemplateSupported": false,
+  "metadata": {
+    "general.architecture": "llama",
+    "general.file_type": "1",
+    "general.name": "LLaMA v2",
+    "llama.attention.head_count": "2",
+    "llama.attention.head_count_kv": "2",
+    "llama.attention.layer_norm_rms_epsilon": "0.000010",
+    "llama.block_count": "1",
+    "llama.context_length": "4096",
+    "llama.embedding_length": "8",
+    "llama.feed_forward_length": "32",
+    "llama.rope.dimension_count": "4",
+    "tokenizer.ggml.bos_token_id": "1",
+    "tokenizer.ggml.eos_token_id": "2",
+    "tokenizer.ggml.model": "llama",
+    "tokenizer.ggml.unknown_token_id": "0",
+  },
+  "nParams": 513048,
+  "size": 1026144,
+}
+`;
+
+exports[`works fine with vocab_only: tokenize 1`] = `
+{
+  "tokens": Int32Array [
+    9038,
+    2501,
+    263,
+    931,
+  ],
+}
+`;
+
+exports[`works fine: model info 1`] = `
 {
   "desc": "llama ?B F16",
   "isChatTemplateSupported": false,
diff --git a/test/index.test.ts b/test/index.test.ts
index 9519a95..1dc9cf3 100644
--- a/test/index.test.ts
+++ b/test/index.test.ts
@@ -2,7 +2,7 @@ import path from 'path'
 import waitForExpect from 'wait-for-expect'
 import { loadModel } from '../lib'
 
-it('work fine', async () => {
+it('works fine', async () => {
   let tokens = ''
   const model = await loadModel({ model: path.resolve(__dirname, './tiny-random-llama.gguf') })
   const info = model.getModelInfo()
@@ -30,6 +30,13 @@ it('work fine', async () => {
   await model.release()
 })
 
+it('works fine with vocab_only', async () => {
+  const model = await loadModel({ model: path.resolve(__dirname, './tiny-random-llama.gguf'), vocab_only: true })
+  expect(model.getModelInfo()).toMatchSnapshot('model info')
+  expect(await model.tokenize('Once upon a time')).toMatchSnapshot('tokenize')
+  expect(await model.completion({ prompt: 'Once upon a time' })).toMatchSnapshot('empty result')
+})
+
 it('tokeneize', async () => {
   const model = await loadModel({ model: path.resolve(__dirname, './tiny-random-llama.gguf') })
   {