From 5ef32d5dd58bfd97dca185df7c67bedf281e44c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20G=C3=B3mez?=
Date: Fri, 13 Sep 2024 22:53:39 +0200
Subject: [PATCH] feat: add missed ollama API settings

---
 .../ollama/src/ollama-chat-language-model.ts | 14 ++++
 packages/ollama/src/ollama-chat-settings.ts  | 75 +++++++++++++++++++
 2 files changed, 89 insertions(+)

diff --git a/packages/ollama/src/ollama-chat-language-model.ts b/packages/ollama/src/ollama-chat-language-model.ts
index 2acfbbe..b11df49 100644
--- a/packages/ollama/src/ollama-chat-language-model.ts
+++ b/packages/ollama/src/ollama-chat-language-model.ts
@@ -77,12 +77,22 @@ export class OllamaChatLanguageModel implements LanguageModelV1 {
       format: responseFormat?.type,
       model: this.modelId,
       options: removeUndefined({
+        f16_kv: this.settings.f16Kv,
         frequency_penalty: frequencyPenalty,
+        low_vram: this.settings.lowVram,
+        main_gpu: this.settings.mainGpu,
+        min_p: this.settings.minP,
         mirostat: this.settings.mirostat,
         mirostat_eta: this.settings.mirostatEta,
         mirostat_tau: this.settings.mirostatTau,
+        num_batch: this.settings.numBatch,
         num_ctx: this.settings.numCtx,
+        num_gpu: this.settings.numGpu,
+        num_keep: this.settings.numKeep,
         num_predict: maxTokens,
+        num_thread: this.settings.numThread,
+        numa: this.settings.numa,
+        penalize_newline: this.settings.penalizeNewline,
         presence_penalty: presencePenalty,
         repeat_last_n: this.settings.repeatLastN,
         repeat_penalty: this.settings.repeatPenalty,
@@ -92,6 +102,10 @@ export class OllamaChatLanguageModel implements LanguageModelV1 {
         tfs_z: this.settings.tfsZ,
         top_k: this.settings.topK ?? topK,
         top_p: topP,
+        typical_p: this.settings.typicalP,
+        use_mlock: this.settings.useMlock,
+        use_mmap: this.settings.useMmap,
+        vocab_only: this.settings.vocabOnly,
       }),
     }
 
diff --git a/packages/ollama/src/ollama-chat-settings.ts b/packages/ollama/src/ollama-chat-settings.ts
index fc53a9d..cd37e37 100644
--- a/packages/ollama/src/ollama-chat-settings.ts
+++ b/packages/ollama/src/ollama-chat-settings.ts
@@ -94,6 +94,26 @@ export interface OllamaChatSettings {
    */
   experimentalStreamTools?: boolean
+  /**
+   * Enables the use of half-precision floating point values for key-value memory. This helps in optimizing memory usage. (Default: true)
+   */
+  f16Kv?: boolean
+
+  /**
+   * If set to true, reduces the VRAM usage by trading off speed for memory. (Default: false)
+   */
+  lowVram?: boolean
+
+  /**
+   * Sets which GPU is the main one.
+   */
+  mainGpu?: number
+
+  /**
+   * Minimum cumulative probability for tokens to be considered. (Default: 0.0)
+   */
+  minP?: number
+
   /**
    * Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
    */
   mirostat?: number
@@ -111,11 +131,46 @@
    */
   mirostatTau?: number
+  /**
+   * Controls whether to use Non-Uniform Memory Access (NUMA) for more efficient memory management. (Default: false)
+   */
+  numa?: boolean
+
+  /**
+   * Sets the number of batches to be processed. (Default: 512)
+   */
+  numBatch?: number
+
   /**
    * Sets the size of the context window used to generate the next token. (Default: 2048)
    */
   numCtx?: number
 
+  /**
+   * Controls the number of GPUs to use for the operation. (Default: -1, indicates that NumGPU should be set dynamically)
+   */
+  numGpu?: number
+
+  /**
+   * Keeps a number of tokens from the context. Controls how many of the previous tokens are retained. (Default: 4)
+   */
+  numKeep?: number
+
+  /**
+   * Controls the number of tokens to predict in a single generation. (Default: -1)
+   */
+  numPredict?: number
+
+  /**
+   * Sets the number of CPU threads to use. (Default: 0, indicates let the runtime decide)
+   */
+  numThread?: number
+
+  /**
+   * Penalizes the model for generating newline characters. If set to true, it discourages the model from generating too many newlines. (Default: true)
+   */
+  penalizeNewline?: boolean
+
   /**
    * Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
    */
   repeatLastN?: number
@@ -148,4 +203,24 @@
    * @deprecated Use `topK` from AI SDK functions.
    */
   topK?: number
+
+  /**
+   * Controls the "typical" sampling probability. (Default: 1.0)
+   */
+  typicalP?: number
+
+  /**
+   * Locks the memory to prevent swapping, which can be useful for performance optimization. (Default: false)
+   */
+  useMlock?: boolean
+
+  /**
+   * Enables memory mapping to reduce RAM usage. (Default: false)
+   */
+  useMmap?: boolean
+
+  /**
+   * If true, the model will only load the vocabulary without performing further computation. (Default: false)
+   */
+  vocabOnly?: boolean
 }
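
Usage sketch (reviewer note, not part of the commit): the snippet below shows how the new camelCase settings would flow through `OllamaChatSettings` into the snake_case Ollama request options built by `removeUndefined` in ollama-chat-language-model.ts. It assumes the package's `ollama` provider export and `generateText` from the `ai` package; the model id and option values are illustrative only.

    import { generateText } from 'ai'
    import { ollama } from 'ollama-ai-provider'

    // Each camelCase setting added by this patch is forwarded as the
    // corresponding snake_case Ollama option (undefined values are dropped).
    const model = ollama('llama3.1', {
      minP: 0.05,     // sent as min_p
      numBatch: 256,  // sent as num_batch
      numGpu: 1,      // sent as num_gpu
      numThread: 8,   // sent as num_thread
      useMmap: true,  // sent as use_mmap
    })

    const { text } = await generateText({
      model,
      prompt: 'Why is the sky blue?',
    })

    console.log(text)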