From 5ef32d5dd58bfd97dca185df7c67bedf281e44c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20G=C3=B3mez?=
Date: Fri, 13 Sep 2024 22:53:39 +0200
Subject: [PATCH] feat: add missed ollama API settings

---
 .../ollama/src/ollama-chat-language-model.ts | 14 ++++
 packages/ollama/src/ollama-chat-settings.ts  | 75 +++++++++++++++++++
 2 files changed, 89 insertions(+)

diff --git a/packages/ollama/src/ollama-chat-language-model.ts b/packages/ollama/src/ollama-chat-language-model.ts
index 2acfbbe..b11df49 100644
--- a/packages/ollama/src/ollama-chat-language-model.ts
+++ b/packages/ollama/src/ollama-chat-language-model.ts
@@ -77,12 +77,22 @@ export class OllamaChatLanguageModel implements LanguageModelV1 {
       format: responseFormat?.type,
       model: this.modelId,
       options: removeUndefined({
+        f16_kv: this.settings.f16Kv,
         frequency_penalty: frequencyPenalty,
+        low_vram: this.settings.lowVram,
+        main_gpu: this.settings.mainGpu,
+        min_p: this.settings.minP,
         mirostat: this.settings.mirostat,
         mirostat_eta: this.settings.mirostatEta,
         mirostat_tau: this.settings.mirostatTau,
+        num_batch: this.settings.numBatch,
         num_ctx: this.settings.numCtx,
+        num_gpu: this.settings.numGpu,
+        num_keep: this.settings.numKeep,
         num_predict: maxTokens,
+        num_thread: this.settings.numThread,
+        numa: this.settings.numa,
+        penalize_newline: this.settings.penalizeNewline,
         presence_penalty: presencePenalty,
         repeat_last_n: this.settings.repeatLastN,
         repeat_penalty: this.settings.repeatPenalty,
@@ -92,6 +102,10 @@ export class OllamaChatLanguageModel implements LanguageModelV1 {
         tfs_z: this.settings.tfsZ,
         top_k: this.settings.topK ?? topK,
         top_p: topP,
+        typical_p: this.settings.typicalP,
+        use_mlock: this.settings.useMlock,
+        use_mmap: this.settings.useMmap,
+        vocab_only: this.settings.vocabOnly,
       }),
     }
 
diff --git a/packages/ollama/src/ollama-chat-settings.ts b/packages/ollama/src/ollama-chat-settings.ts
index fc53a9d..cd37e37 100644
--- a/packages/ollama/src/ollama-chat-settings.ts
+++ b/packages/ollama/src/ollama-chat-settings.ts
@@ -94,6 +94,26 @@ export interface OllamaChatSettings {
    */
   experimentalStreamTools?: boolean
+  /**
+   * Enables the use of half-precision floating point values for key-value memory. This helps in optimizing memory usage. (Default: true)
+   */
+  f16Kv?: boolean
+
+  /**
+   * If set to true, reduces the VRAM usage by trading off speed for memory. (Default: false)
+   */
+  lowVram?: boolean
+
+  /**
+   * Sets which GPU is the main one.
+   */
+  mainGpu?: number
+
+  /**
+   * Minimum cumulative probability for tokens to be considered. (Default: 0.0)
+   */
+  minP?: number
+
   /**
    * Enable Mirostat sampling for controlling perplexity. (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
    */
   mirostat?: number
@@ -111,11 +131,46 @@
    */
   mirostatTau?: number
+  /**
+   * Controls whether to use Non-Uniform Memory Access (NUMA) for more efficient memory management. (Default: false)
+   */
+  numa?: boolean
+
+  /**
+   * Sets the number of batches to be processed. (Default: 512)
+   */
+  numBatch?: number
+
   /**
    * Sets the size of the context window used to generate the next token. (Default: 2048)
    */
   numCtx?: number
 
+  /**
+   * Controls the number of GPUs to use for the operation. (Default: -1, indicates that NumGPU should be set dynamically)
+   */
+  numGpu?: number
+
+  /**
+   * Keeps a number of tokens from the context. Controls how many of the previous tokens are retained. (Default: 4)
+   */
+  numKeep?: number
+
+  /**
+   * Controls the number of tokens to predict in a single generation. (Default: -1)
+   */
+  numPredict?: number
+
+  /**
+   * Sets the number of CPU threads to use. (Default: 0, indicates let the runtime decide)
+   */
+  numThread?: number
+
+  /**
+   * Penalizes the model for generating newline characters. If set to true, it discourages the model from generating too many newlines. (Default: true)
+   */
+  penalizeNewline?: boolean
+
   /**
    * Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
    */
   repeatLastN?: number
@@ -148,4 +203,24 @@
    * @deprecated Use `topK` from AI SDK functions.
    */
   topK?: number
+
+  /**
+   * Controls the "typical" sampling probability. (Default: 1.0)
+   */
+  typicalP?: number
+
+  /**
+   * Locks the memory to prevent swapping, which can be useful for performance optimization. (Default: false)
+   */
+  useMlock?: boolean
+
+  /**
+   * Enables memory mapping to reduce RAM usage. (Default: false)
+   */
+  useMmap?: boolean
+
+  /**
+   * If true, the model will only load the vocabulary without performing further computation. (Default: false)
+   */
+  vocabOnly?: boolean
 }
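
Usage sketch (reviewer note, not part of the commit): the snippet below shows how the new camelCase settings would flow through `OllamaChatSettings` into the snake_case Ollama request options built by `removeUndefined` in ollama-chat-language-model.ts. It assumes the package's `ollama` provider export and `generateText` from the `ai` package; the model id and option values are illustrative only.

    import { generateText } from 'ai'
    import { ollama } from 'ollama-ai-provider'

    // Each camelCase setting added by this patch is forwarded as the
    // corresponding snake_case Ollama option (undefined values are dropped).
    const model = ollama('llama3.1', {
      minP: 0.05,     // sent as min_p
      numBatch: 256,  // sent as num_batch
      numGpu: 1,      // sent as num_gpu
      numThread: 8,   // sent as num_thread
      useMmap: true,  // sent as use_mmap
    })

    const { text } = await generateText({
      model,
      prompt: 'Why is the sky blue?',
    })

    console.log(text)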