This repository has been archived by the owner on Mar 30, 2024. It is now read-only.

Add support for llama-2, upgrade ctransformer to 0.2.13 (#9)
Signed-off-by: Hung-Han (Henry) Chen <[email protected]>
chenhunghan authored Jul 19, 2023
1 parent 639411d commit 7336f19
Showing 8 changed files with 126 additions and 14 deletions.
29 changes: 29 additions & 0 deletions README.md
@@ -14,6 +14,7 @@ This project is inspired by other similar projects like [LocalAI](https://github

See "Receipts" below for deployment instructions.

- [LLaMa 2 variants](https://huggingface.co/meta-llama)
- [OpenLLaMA variants](https://github.com/openlm-research/open_llama)
- [StarCoder variants](https://huggingface.co/bigcode/starcoder)
- [WizardCoder](https://huggingface.co/WizardLM/WizardCoder-15B-V1.0)
@@ -90,6 +91,34 @@ print(chat_completion.choices[0].message.content)

## Receipts

### Llama-2

Deploy [Meta's Llama 2 Chat](https://huggingface.co/meta-llama) model quantized by [TheBloke](https://huggingface.co/TheBloke).

7B Chat

```sh
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
helm install llama2-7b-chat ialacol/ialacol -f examples/values/llama2-7b-chat.yaml
```

13B Chat

```sh
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
helm install llama2-13b-chat ialacol/ialacol -f examples/values/llama2-13b-chat.yaml
```

70B Chat

```sh
helm repo add ialacol https://chenhunghan.github.io/ialacol
helm repo update
helm install llama2-70b-chat ialacol/ialacol -f examples/values/llama2-70b-chat.yaml
```
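
Once a release is running, the chart exposes an OpenAI-compatible API on port 8000 (per the `service` section of the values files below). A quick smoke test, assuming the Kubernetes service name matches the release name (check `kubectl get svc` for the actual name in your cluster):

```shell
# Forward the service port to localhost.
# The service name is an assumption; verify with `kubectl get svc`.
kubectl port-forward svc/llama2-7b-chat 8000:8000 &

# Query the OpenAI-compatible chat completions endpoint.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
    "messages": [{"role": "user", "content": "Hello!"}]
  }'
```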

### OpenLM Research's OpenLLaMA Models

Deploy [OpenLLaMA 7B](https://github.com/openlm-research/open_llama) model quantized by [rustformers](https://huggingface.co/rustformers/open-llama-ggml). ℹ️ This is a base model, likely only useful for text completion.
4 changes: 2 additions & 2 deletions charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v1
-appVersion: 0.3.1
+appVersion: 0.4.1
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.4.2
+version: 0.5.2
30 changes: 30 additions & 0 deletions examples/values/llama2-13b-chat.yaml
@@ -0,0 +1,30 @@
replicas: 1
deployment:
image: quay.io/chenhunghan/ialacol:latest
env:
DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-13B-chat-GGML
DEFAULT_MODEL_FILE: llama-2-13b-chat.ggmlv3.q4_0.bin
DEFAULT_MODEL_META: ""
resources:
{}
cache:
persistence:
size: 15Gi
accessModes:
- ReadWriteOnce
storageClass: ~
cacheMountPath: /app/cache
model:
persistence:
size: 20Gi
accessModes:
- ReadWriteOnce
storageClass: ~
modelMountPath: /app/models
service:
type: ClusterIP
port: 8000
annotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
30 changes: 30 additions & 0 deletions examples/values/llama2-70b-chat.yaml
@@ -0,0 +1,30 @@
replicas: 1
deployment:
image: quay.io/chenhunghan/ialacol:latest
env:
DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-70B-chat-GGML
DEFAULT_MODEL_FILE: llama-2-70b-chat.ggmlv3.q4_0.bin
DEFAULT_MODEL_META: ""
resources:
{}
cache:
persistence:
size: 40Gi
accessModes:
- ReadWriteOnce
storageClass: ~
cacheMountPath: /app/cache
model:
persistence:
size: 40Gi
accessModes:
- ReadWriteOnce
storageClass: ~
modelMountPath: /app/models
service:
type: ClusterIP
port: 8000
annotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
30 changes: 30 additions & 0 deletions examples/values/llama2-7b-chat.yaml
@@ -0,0 +1,30 @@
replicas: 1
deployment:
image: quay.io/chenhunghan/ialacol:latest
env:
DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
DEFAULT_MODEL_META: ""
resources:
{}
cache:
persistence:
size: 5Gi
accessModes:
- ReadWriteOnce
storageClass: ~
cacheMountPath: /app/cache
model:
persistence:
size: 5Gi
accessModes:
- ReadWriteOnce
storageClass: ~
modelMountPath: /app/models
service:
type: ClusterIP
port: 8000
annotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
4 changes: 4 additions & 0 deletions main.py
@@ -321,6 +321,10 @@ async def chat_completions(
    default_user_end = ""
    default_system = ""

    if "llama" in body.model:
        default_assistant_start = "ASSISTANT:"
        default_user_start = "USER: "
        default_system = "SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
    # For most instruct fine-tuned models, use the Alpaca prompt template.
    # Although instruct fine-tuned models are not tuned for chat, they can
    # generate responses as if chatting; the Alpaca prompt template likely
    # gives better results than the default prompt template.
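
The branch above only selects role markers; the final prompt is assembled elsewhere in main.py. A minimal sketch of how such markers are typically joined with the chat history into a single prompt string (the helper name and message format below are illustrative, not the project's actual code):

```python
def build_prompt(messages,
                 system="SYSTEM: You are a helpful assistant.",
                 user_start="USER: ",
                 assistant_start="ASSISTANT:"):
    """Join role markers and chat history into one prompt string."""
    lines = [system]
    for message in messages:
        if message["role"] == "user":
            lines.append(f"{user_start}{message['content']}")
        else:
            lines.append(f"{assistant_start} {message['content']}")
    # End with the assistant marker so the model continues from there.
    lines.append(assistant_start)
    return "\n".join(lines)

prompt = build_prompt([{"role": "user", "content": "Hi there"}])
```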
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
certifi==2023.5.7
charset-normalizer==3.1.0
click==8.1.3
-ctransformers==0.2.10
+ctransformers==0.2.13
fastapi==0.95.2
filelock==3.12.0
fsspec==2023.5.0
11 changes: 0 additions & 11 deletions values.yaml

This file was deleted.
