From 7336f19df0cc97ad173805800648c69e1fa56a22 Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Wed, 19 Jul 2023 19:20:05 +0300
Subject: [PATCH] Add support for llama-2, upgrade ctransformer to 0.2.13 (#9)

Signed-off-by: Hung-Han (Henry) Chen
---
 README.md                            | 29 +++++++++++++++++++++++++++
 charts/ialacol/Chart.yaml            |  4 ++--
 examples/values/llama2-13b-chat.yaml | 30 ++++++++++++++++++++++++++++
 examples/values/llama2-70b-chat.yaml | 30 ++++++++++++++++++++++++++++
 examples/values/llama2-7b-chat.yaml  | 30 ++++++++++++++++++++++++++++
 main.py                              |  4 ++++
 requirements.txt                     |  2 +-
 values.yaml                          | 11 ----------
 8 files changed, 126 insertions(+), 14 deletions(-)
 create mode 100644 examples/values/llama2-13b-chat.yaml
 create mode 100644 examples/values/llama2-70b-chat.yaml
 create mode 100644 examples/values/llama2-7b-chat.yaml
 delete mode 100644 values.yaml

diff --git a/README.md b/README.md
index 337bc95..d95cf1e 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This project is inspired by other similar projects like [LocalAI](https://github
 
 See "Receipts" below for instructions of deployments.
 
+- [LLaMa 2 variants](https://huggingface.co/meta-llama)
 - [OpenLLaMA variants](https://github.com/openlm-research/open_llama)
 - [StarCoder variants](https://huggingface.co/bigcode/starcoder)
 - [WizardCoder](https://huggingface.co/WizardLM/WizardCoder-15B-V1.0)
@@ -90,6 +91,34 @@ print(chat_completion.choices[0].message.content)
 
 ## Receipts
 
+### Llama-2
+
+Deploy [Meta's Llama 2 Chat](https://huggingface.co/meta-llama) model quantized by [TheBloke](https://huggingface.co/TheBloke).
+
+7B Chat
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+helm install llama2-7b-chat ialacol/ialacol -f examples/values/llama2-7b-chat.yaml
+```
+
+13B Chat
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+helm install llama2-13b-chat ialacol/ialacol -f examples/values/llama2-13b-chat.yaml
+```
+
+70B Chat
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+helm install llama2-70b-chat ialacol/ialacol -f examples/values/llama2-70b-chat.yaml
+```
+
 ### OpenLM Research's OpenLLaMA Models
 
 Deploy [OpenLLaMA 7B](https://github.com/openlm-research/open_llama) model quantized by [rustformers](https://huggingface.co/rustformers/open-llama-ggml). ℹ️ This is a base model, likely only useful for text completion.
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index c54ee55..9c800a8 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v1
-appVersion: 0.3.1
+appVersion: 0.4.1
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.4.2
+version: 0.5.2
diff --git a/examples/values/llama2-13b-chat.yaml b/examples/values/llama2-13b-chat.yaml
new file mode 100644
index 0000000..202dbb2
--- /dev/null
+++ b/examples/values/llama2-13b-chat.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: quay.io/chenhunghan/ialacol:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-13B-chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-13b-chat.ggmlv3.q4_0.bin
+    DEFAULT_MODEL_META: ""
+resources:
+  {}
+cache:
+  persistence:
+    size: 15Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 20Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/examples/values/llama2-70b-chat.yaml b/examples/values/llama2-70b-chat.yaml
new file mode 100644
index 0000000..c3b036e
--- /dev/null
+++ b/examples/values/llama2-70b-chat.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: quay.io/chenhunghan/ialacol:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-70B-chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-70b-chat.ggmlv3.q4_0.bin
+    DEFAULT_MODEL_META: ""
+resources:
+  {}
+cache:
+  persistence:
+    size: 40Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 40Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/examples/values/llama2-7b-chat.yaml b/examples/values/llama2-7b-chat.yaml
new file mode 100644
index 0000000..8182787
--- /dev/null
+++ b/examples/values/llama2-7b-chat.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: quay.io/chenhunghan/ialacol:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
+    DEFAULT_MODEL_META: ""
+resources:
+  {}
+cache:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/main.py b/main.py
index a9a32be..be336c7 100644
--- a/main.py
+++ b/main.py
@@ -321,6 +321,10 @@ async def chat_completions(
     default_user_end = ""
     default_system = ""
+    if "llama" in body.model:
+        default_assistant_start = "ASSISTANT:"
+        default_user_start = "USER: "
+        default_system="SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
 
     # For most instruct fine-tuned models using Alpaca prompt template
     # Although instruct fine-tuned models are not tuned for chat, they can be to generate response as if chatting, using Alpaca
     # prompt template likely gives better results than using the default prompt template
diff --git a/requirements.txt b/requirements.txt
index a92ba75..04f4ce6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.5.7
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.10
+ctransformers==0.2.13
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
diff --git a/values.yaml b/values.yaml
deleted file mode 100644
index d5b9beb..0000000
--- a/values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-replicas: 1
-deployment:
-  image: quay.io/chenhunghan/ialacol:latest
-  env:
-    DEFAULT_MODEL_HG_REPO_ID: TheBloke/orca_mini_3B-GGML
-    DEFAULT_MODEL_FILE: orca-mini-3b.ggmlv3.q4_0.bin
-    DEFAULT_MODEL_META: ""
-service:
-  type: ClusterIP
-  port: 8000
-  annotations: {}
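
The README sections added above install the chart; the README's surrounding context already drives the server through its OpenAI-compatible API with the `openai` Python client (`print(chat_completion.choices[0].message.content)`). As a minimal sketch of exercising a freshly installed Llama 2 release, assuming the `llama2-7b-chat` service has been port-forwarded to localhost:8000 (the exact Kubernetes service name depends on the chart's naming template, which is not part of this patch) and that the model name mirrors `DEFAULT_MODEL_FILE` from the 7B example values:

```python
import openai

# Assumed setup, not shown in the patch:
#   kubectl port-forward svc/llama2-7b-chat 8000:8000
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "placeholder"  # assumed unused by the self-hosted endpoint

chat_completion = openai.ChatCompletion.create(
    model="llama-2-7b-chat.ggmlv3.q4_0.bin",  # mirrors DEFAULT_MODEL_FILE above
    messages=[{"role": "user", "content": "Explain Kubernetes in one sentence."}],
)
print(chat_completion.choices[0].message.content)
```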
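The `main.py` hunk only sets Llama 2 defaults for the `SYSTEM: `/`USER: `/`ASSISTANT:` markers; the code that joins chat messages into the final prompt lives elsewhere in `main.py` and is not part of this diff. Purely as an illustration of what such a template produces, here is a sketch in which the function name and joining order are assumptions, not the project's actual logic:

```python
def build_chat_prompt(messages, system, user_start="USER: ", assistant_start="ASSISTANT:"):
    """Illustrative only: join chat messages using SYSTEM/USER/ASSISTANT
    markers like the defaults set in the main.py hunk above."""
    parts = [system] if system else []
    for message in messages:
        if message["role"] == "user":
            parts.append(f"{user_start}{message['content']}")
        elif message["role"] == "assistant":
            parts.append(f"{assistant_start} {message['content']}")
    # End on the assistant marker so the model continues with its reply.
    parts.append(assistant_start)
    return "\n".join(parts)


print(build_chat_prompt(
    [{"role": "user", "content": "Hi, who are you?"}],
    system="SYSTEM: You are a helpful, respectful and honest assistant.",
))
```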
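The `requirements.txt` bump to `ctransformers==0.2.13` is what enables loading the GGML Llama 2 weights referenced in the new example values files. As a rough sketch of that backend call (the actual model-loading code in `main.py` is not shown in this patch; the repo id and file name simply mirror the 7B values file, and the prompt and generation parameters are arbitrary):

```python
from ctransformers import AutoModelForCausalLM

# Download and load the quantized 7B chat model named in
# examples/values/llama2-7b-chat.yaml, then run one completion.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-GGML",
    model_file="llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
)
print(llm("USER: What is a Helm chart?\nASSISTANT:", max_new_tokens=128))
```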