From ca45fce8ac49285872ae1a72ca04f880f0d027fb Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Tue, 13 Aug 2024 15:26:52 +0530
Subject: [PATCH 1/3] Break long links in train of thought to stay within chat page width

---
 src/interface/web/app/components/chatMessage/chatMessage.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/interface/web/app/components/chatMessage/chatMessage.tsx b/src/interface/web/app/components/chatMessage/chatMessage.tsx
index 2613ef47f..98f74d5a3 100644
--- a/src/interface/web/app/components/chatMessage/chatMessage.tsx
+++ b/src/interface/web/app/components/chatMessage/chatMessage.tsx
@@ -262,7 +262,7 @@ export function TrainOfThought(props: TrainOfThoughtProps) {
     let markdownRendered = DOMPurify.sanitize(md.render(props.message));
     return (
             {icon}
From acdc3f947077caf6b9b8b0958269515c38d88daf Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Fri, 16 Aug 2024 05:20:24 -0500
Subject: [PATCH 2/3] Unwrap any json in md code block when parsing chat actor responses

This is a more robust way to extract the json output requested from
gemma-2 (2B, 9B) models, which tend to return json in md codeblocks.
Other models should remain unaffected by this change.

Also removed the request to not wrap json in codeblocks from the
prompts, since the code now unwraps it automatically when present.
---
 pyproject.toml                                         | 2 +-
 src/khoj/processor/conversation/offline/chat_model.py | 3 +++
 src/khoj/processor/conversation/prompts.py            | 2 +-
 src/khoj/routers/helpers.py                            | 6 ++++++
 4 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4b651dad8..edbbb655a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -66,7 +66,7 @@ dependencies = [
     "pymupdf >= 1.23.5",
     "django == 5.0.7",
     "authlib == 1.2.1",
-    "llama-cpp-python == 0.2.82",
+    "llama-cpp-python == 0.2.88",
     "itsdangerous == 2.1.2",
     "httpx == 0.25.0",
     "pgvector == 0.2.4",
diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py
index ec4c7367a..1251dceca 100644
--- a/src/khoj/processor/conversation/offline/chat_model.py
+++ b/src/khoj/processor/conversation/offline/chat_model.py
@@ -103,6 +103,9 @@ def extract_questions_offline(
             .replace("']", '"]')
             .replace("', '", '", "')
         )
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if questions_str.startswith("```json"):
+            questions_str = questions_str[7:-3]
         questions: List[str] = json.loads(questions_str)
         questions = filter_questions(questions)
     except:
diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py
index 6a8db9db0..ffd7d0946 100644
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@@ -587,7 +587,7 @@
 - Official, up-to-date information about you, Khoj, is available at site:khoj.dev, github or pypi.
 
 What Google searches, if any, will you need to perform to answer the user's question?
-Provide search queries as a list of strings in a JSON object. Do not wrap the json in a codeblock.
+Provide search queries as a list of strings in a JSON object.
 Current Date: {current_date}
 User's Location: {location}
 {username}
diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py
index 4e4f5a56d..4da607173 100644
--- a/src/khoj/routers/helpers.py
+++ b/src/khoj/routers/helpers.py
@@ -279,6 +279,9 @@ async def aget_relevant_information_sources(query: str, conversation_history: di
 
     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["source"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:
@@ -401,6 +404,9 @@ async def generate_online_subqueries(
     # Validate that the response is a non-empty, JSON-serializable list
     try:
         response = response.strip()
+        # Remove any markdown json codeblock formatting if present (useful for gemma-2)
+        if response.startswith("```json") and response.endswith("```"):
+            response = response[7:-3]
         response = json.loads(response)
         response = [q.strip() for q in response["queries"] if q.strip()]
         if not isinstance(response, list) or not response or len(response) == 0:
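The same three-line fence-stripping snippet now appears at each parse site touched by this patch. Purely as a sketch of the idea (the helper name, the `endswith` guard, and the sample response below are assumptions, not code from this patch), the unwrapping could be captured in a single helper:

```python
import json


def strip_json_codeblock(response: str) -> str:
    """Drop a leading ```json and trailing ``` fence, as gemma-2 models tend to emit."""
    response = response.strip()
    # Only unwrap when both fences are present, so bare json passes through unchanged
    if response.startswith("```json") and response.endswith("```"):
        response = response[len("```json") : -len("```")]
    return response.strip()


# A gemma-2 style response wrapped in a markdown codeblock
raw = '```json\n{"queries": ["khoj offline chat setup"]}\n```'
print(json.loads(strip_json_codeblock(raw))["queries"])  # ['khoj offline chat setup']
```

Responses without a fence pass through such a helper unchanged, matching the intent that other models remain unaffected.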
From 58c806807950733eaf7f063ab73da83f149c5865 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Fri, 16 Aug 2024 07:58:04 -0500
Subject: [PATCH 3/3] Upgrade default offline chat model to llama 3.1

---
 documentation/docs/features/chat.md            | 2 +-
 documentation/docs/get-started/setup.mdx       | 2 +-
 .../0058_alter_chatmodeloptions_chat_model.py  | 17 +++++++++++++++++
 src/khoj/database/models/__init__.py           | 2 +-
 .../conversation/offline/chat_model.py         | 6 +++---
 .../processor/conversation/offline/utils.py    | 2 +-
 src/khoj/processor/conversation/utils.py       | 1 +
 src/khoj/utils/config.py                       | 2 +-
 src/khoj/utils/constants.py                    | 2 +-
 src/khoj/utils/rawconfig.py                    | 2 +-
 tests/conftest.py                              | 2 +-
 tests/helpers.py                               | 2 +-
 12 files changed, 30 insertions(+), 12 deletions(-)
 create mode 100644 src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py

diff --git a/documentation/docs/features/chat.md b/documentation/docs/features/chat.md
index ed4fe9fe1..5876dc765 100644
--- a/documentation/docs/features/chat.md
+++ b/documentation/docs/features/chat.md
@@ -25,7 +25,7 @@ Offline chat stays completely private and can work without internet using open-s
 > - An Nvidia, AMD GPU or a Mac M1+ machine would significantly speed up chat response times
 
 1. Open your [Khoj offline settings](http://localhost:42110/server/admin/database/offlinechatprocessorconversationconfig/) and click *Enable* on the Offline Chat configuration.
-2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Hermes-2-Pro-Mistral-7B by NousResearch](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) by default.
+2. Open your [Chat model options settings](http://localhost:42110/server/admin/database/chatmodeloptions/) and add any [GGUF chat model](https://huggingface.co/models?library=gguf) to use for offline chat. Make sure to use `Offline` as its type. For a balanced chat model that runs well on standard consumer hardware we recommend using [Llama 3.1 by Meta](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) by default.
 
 :::tip[Note]
diff --git a/documentation/docs/get-started/setup.mdx b/documentation/docs/get-started/setup.mdx
index bc954bde4..61d2ef3d0 100644
--- a/documentation/docs/get-started/setup.mdx
+++ b/documentation/docs/get-started/setup.mdx
@@ -222,7 +222,7 @@ Using Ollama? See the [Ollama Integration](/advanced/ollama) section for more cu
 
 Any chat model on Huggingface in GGUF format can be used for local chat. Here's how you can set it up:
 1. No need to setup a conversation processor config!
-2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `NousResearch/Hermes-2-Pro-Mistral-7B-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
+2. Go over to configure your [chat model options](http://localhost:42110/server/admin/database/chatmodeloptions/). Set the `chat-model` field to a supported chat model[^1] of your choice. For example, we recommend `bartowski/Meta-Llama-3.1-8B-Instruct-GGUF`, but [any gguf model on huggingface](https://huggingface.co/models?library=gguf) should work.
    - Make sure to set the `model-type` to `Offline`. Do not set `openai config`.
    - The `tokenizer` and `max-prompt-size` fields are optional. You can set these for non-standard models (i.e not Mistral or Llama based models) or when you know the token limit of the model to improve context stuffing.
 
diff --git a/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py b/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py
new file mode 100644
index 000000000..ea4515e1c
--- /dev/null
+++ b/src/khoj/database/migrations/0058_alter_chatmodeloptions_chat_model.py
@@ -0,0 +1,17 @@
+# Generated by Django 5.0.7 on 2024-08-19 12:37
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("database", "0057_remove_serverchatsettings_default_model_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="chatmodeloptions",
+            name="chat_model",
+            field=models.CharField(default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_length=200),
+        ),
+    ]
diff --git a/src/khoj/database/models/__init__.py b/src/khoj/database/models/__init__.py
index 72c93157d..2468ffc9c 100644
--- a/src/khoj/database/models/__init__.py
+++ b/src/khoj/database/models/__init__.py
@@ -91,7 +91,7 @@ class ModelType(models.TextChoices):
     max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     subscribed_max_prompt_size = models.IntegerField(default=None, null=True, blank=True)
     tokenizer = models.CharField(max_length=200, default=None, null=True, blank=True)
-    chat_model = models.CharField(max_length=200, default="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF")
+    chat_model = models.CharField(max_length=200, default="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF")
     model_type = models.CharField(max_length=200, choices=ModelType.choices, default=ModelType.OFFLINE)
     openai_config = models.ForeignKey(
         OpenAIProcessorConversationConfig, on_delete=models.CASCADE, default=None, null=True, blank=True
diff --git a/src/khoj/processor/conversation/offline/chat_model.py b/src/khoj/processor/conversation/offline/chat_model.py
index 1251dceca..c62d1e005 100644
--- a/src/khoj/processor/conversation/offline/chat_model.py
+++ b/src/khoj/processor/conversation/offline/chat_model.py
@@ -24,7 +24,7 @@
 
 def extract_questions_offline(
     text: str,
-    model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", loaded_model: Union[Any, None] = None, conversation_log={}, use_history: bool = True, @@ -141,7 +141,7 @@ def converse_offline( references=[], online_results=[], conversation_log={}, - model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", loaded_model: Union[Any, None] = None, completion_func=None, conversation_commands=[ConversationCommand.Default], @@ -240,7 +240,7 @@ def llm_thread(g, messages: List[ChatMessage], model: Any, max_prompt_size: int def send_message_to_model_offline( messages: List[ChatMessage], loaded_model=None, - model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", + model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", temperature: float = 0.2, streaming=False, stop=[], diff --git a/src/khoj/processor/conversation/offline/utils.py b/src/khoj/processor/conversation/offline/utils.py index 66017b36c..88082ad1f 100644 --- a/src/khoj/processor/conversation/offline/utils.py +++ b/src/khoj/processor/conversation/offline/utils.py @@ -75,6 +75,6 @@ def load_model_from_cache(repo_id: str, filename: str, repo_type="models"): def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int: """Infer max prompt size based on device memory and max context window supported by the model""" configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens - vram_based_n_ctx = int(get_device_memory() / 2e6) # based on heuristic + vram_based_n_ctx = int(get_device_memory() / 1e6) # based on heuristic configured_max_tokens = configured_max_tokens or math.inf # do not use if set to None return min(configured_max_tokens, vram_based_n_ctx, model_context_window) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index ea7368e6b..251ac197a 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -25,6 +25,7 @@ "gpt-4-turbo-preview": 20000, "TheBloke/Mistral-7B-Instruct-v0.2-GGUF": 3500, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF": 3500, + "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF": 20000, } model_to_tokenizer: Dict[str, str] = {} diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 0e88075f8..03dad75cb 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -70,7 +70,7 @@ class OfflineChatProcessorConfig: class OfflineChatProcessorModel: - def __init__(self, chat_model: str = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", max_tokens: int = None): + def __init__(self, chat_model: str = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", max_tokens: int = None): self.chat_model = chat_model self.loaded_model = None try: diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index c3d8a1866..9b7ffb777 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -8,7 +8,7 @@ app_env_filepath = "~/.khoj/env" telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" content_directory = "~/.khoj/content/" -default_offline_chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" +default_offline_chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF" default_online_chat_model = "gpt-4-turbo-preview" empty_config = { diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 617f37ea2..6a788531f 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -93,7 +93,7 @@ class 
 
 
 class OfflineChatProcessorConfig(ConfigBase):
-    chat_model: Optional[str] = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model: Optional[str] = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
 
 
 class ConversationProcessorConfig(ConfigBase):
diff --git a/tests/conftest.py b/tests/conftest.py
index 61578ce20..0fe9d3604 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -378,7 +378,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
 
     # Initialize Processor from Config
     ChatModelOptionsFactory(
-        chat_model="NousResearch/Hermes-2-Pro-Mistral-7B-GGUF",
+        chat_model="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
         tokenizer=None,
         max_prompt_size=None,
         model_type="offline",
diff --git a/tests/helpers.py b/tests/helpers.py
index 7894ffa2c..2e8e56715 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -49,7 +49,7 @@ class Meta:
 
     max_prompt_size = 3500
     tokenizer = None
-    chat_model = "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF"
+    chat_model = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF"
     model_type = "offline"
     openai_config = factory.LazyAttribute(
         lambda obj: OpenAIProcessorConversationConfigFactory() if os.getenv("OPENAI_API_KEY") else None
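For a sense of scale on the infer_max_tokens tweak in src/khoj/processor/conversation/offline/utils.py above: changing the divisor from 2e6 to 1e6 doubles the memory-based context estimate, to roughly one token per MB of device memory. A minimal, lightly condensed sketch (get_device_memory is stubbed out here and the 8 GB figure is just an assumed example; the real function lives elsewhere in khoj):

```python
import math


def get_device_memory() -> int:
    # Stub for illustration: assume a machine with 8 GB of device memory
    return 8 * 1024**3


def infer_max_tokens(model_context_window: int, configured_max_tokens=None) -> int:
    """Infer max prompt size from device memory and the model's context window"""
    configured_max_tokens = math.inf if configured_max_tokens is None else configured_max_tokens
    vram_based_n_ctx = int(get_device_memory() / 1e6)  # ~1 token per MB of device memory
    return min(configured_max_tokens, vram_based_n_ctx, model_context_window)


# 8 GB -> int(8 * 1024**3 / 1e6) = 8589 tokens from the heuristic, which wins here
# against Llama 3.1's 128k context window and the 20000 prompt-size cap added above.
print(infer_max_tokens(model_context_window=128_000, configured_max_tokens=20_000))  # 8589
```

Under the old 2e6 divisor the same machine would have been capped at 4294 tokens, so on memory-constrained hosts the change roughly doubles how much context gets stuffed into the prompt.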