Skip to content

Commit

Permalink
Add support for Llama 3 in Khoj offline mode
Browse files Browse the repository at this point in the history
- Improve the extract-question prompts to explicitly request a JSON list
- Use the llama-3 chat format if the HF repo_id mentions llama-3, because
  the llama-cpp-python logic for detecting when to use the llama-3 chat
  format isn't robust enough currently.
  • Loading branch information
debanjum committed Apr 24, 2024
1 parent 8e77b3d commit a2e4e4b
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ dependencies = [
"pymupdf >= 1.23.5",
"django == 4.2.10",
"authlib == 1.2.1",
"llama-cpp-python == 0.2.56",
"llama-cpp-python == 0.2.64",
"itsdangerous == 2.1.2",
"httpx == 0.25.0",
"pgvector == 0.2.4",
Expand Down
7 changes: 6 additions & 1 deletion src/khoj/processor/conversation/offline/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import math
import os
from typing import Any, Dict

from huggingface_hub.constants import HF_HUB_CACHE

Expand All @@ -14,12 +15,16 @@
def download_model(repo_id: str, filename: str = "*Q4_K_M.gguf", max_tokens: int = None):
# Initialize Model Parameters
# Use n_ctx=0 to get context size from the model
kwargs = {"n_threads": 4, "n_ctx": 0, "verbose": False}
kwargs: Dict[str, Any] = {"n_threads": 4, "n_ctx": 0, "verbose": False}

# Decide whether to load model to GPU or CPU
device = "gpu" if state.chat_on_gpu and state.device != "cpu" else "cpu"
kwargs["n_gpu_layers"] = -1 if device == "gpu" else 0

# Add chat format if known
if "llama-3" in repo_id.lower():
kwargs["chat_format"] = "llama-3"

# Check if the model is already downloaded
model_path = load_model_from_cache(repo_id, filename)
chat_model = None
Expand Down

0 comments on commit a2e4e4b

Please sign in to comment.