From 7336f19df0cc97ad173805800648c69e1fa56a22 Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Wed, 19 Jul 2023 19:20:05 +0300
Subject: [PATCH] Add support for llama-2, upgrade ctransformer to 0.2.13 (#9)

Signed-off-by: Hung-Han (Henry) Chen
---
 README.md                            | 29 +++++++++++++++++++++++++++
 charts/ialacol/Chart.yaml            |  4 ++--
 examples/values/llama2-13b-chat.yaml | 30 ++++++++++++++++++++++++++++
 examples/values/llama2-70b-chat.yaml | 30 ++++++++++++++++++++++++++++
 examples/values/llama2-7b-chat.yaml  | 30 ++++++++++++++++++++++++++++
 main.py                              |  4 ++++
 requirements.txt                     |  2 +-
 values.yaml                          | 11 ----------
 8 files changed, 126 insertions(+), 14 deletions(-)
 create mode 100644 examples/values/llama2-13b-chat.yaml
 create mode 100644 examples/values/llama2-70b-chat.yaml
 create mode 100644 examples/values/llama2-7b-chat.yaml
 delete mode 100644 values.yaml

diff --git a/README.md b/README.md
index 337bc95..d95cf1e 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This project is inspired by other similar projects like [LocalAI](https://github
 
 See "Receipts" below for instructions of deployments.
 
+- [LLaMa 2 variants](https://huggingface.co/meta-llama)
 - [OpenLLaMA variants](https://github.com/openlm-research/open_llama)
 - [StarCoder variants](https://huggingface.co/bigcode/starcoder)
 - [WizardCoder](https://huggingface.co/WizardLM/WizardCoder-15B-V1.0)
@@ -90,6 +91,34 @@ print(chat_completion.choices[0].message.content)
 
 ## Receipts
 
+### Llama-2
+
+Deploy [Meta's Llama 2 Chat](https://huggingface.co/meta-llama) model quantized by [TheBloke](https://huggingface.co/TheBloke).
+
+7B Chat
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+helm install llama2-7b-chat ialacol/ialacol -f examples/values/llama2-7b-chat.yaml
+```
+
+13B Chat
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+helm install llama2-13b-chat ialacol/ialacol -f examples/values/llama2-13b-chat.yaml
+```
+
+70B Chat
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+helm install llama2-70b-chat ialacol/ialacol -f examples/values/llama2-70b-chat.yaml
+```
+
 ### OpenLM Research's OpenLLaMA Models
 
 Deploy [OpenLLaMA 7B](https://github.com/openlm-research/open_llama) model quantized by [rustformers](https://huggingface.co/rustformers/open-llama-ggml). ℹ️ This is a base model, likely only useful for text completion.
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index c54ee55..9c800a8 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v1
-appVersion: 0.3.1
+appVersion: 0.4.1
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.4.2
+version: 0.5.2
diff --git a/examples/values/llama2-13b-chat.yaml b/examples/values/llama2-13b-chat.yaml
new file mode 100644
index 0000000..202dbb2
--- /dev/null
+++ b/examples/values/llama2-13b-chat.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: quay.io/chenhunghan/ialacol:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-13B-chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-13b-chat.ggmlv3.q4_0.bin
+    DEFAULT_MODEL_META: ""
+resources:
+  {}
+cache:
+  persistence:
+    size: 15Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 20Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/examples/values/llama2-70b-chat.yaml b/examples/values/llama2-70b-chat.yaml
new file mode 100644
index 0000000..c3b036e
--- /dev/null
+++ b/examples/values/llama2-70b-chat.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: quay.io/chenhunghan/ialacol:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-70B-chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-70b-chat.ggmlv3.q4_0.bin
+    DEFAULT_MODEL_META: ""
+resources:
+  {}
+cache:
+  persistence:
+    size: 40Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 40Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/examples/values/llama2-7b-chat.yaml b/examples/values/llama2-7b-chat.yaml
new file mode 100644
index 0000000..8182787
--- /dev/null
+++ b/examples/values/llama2-7b-chat.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: quay.io/chenhunghan/ialacol:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
+    DEFAULT_MODEL_META: ""
+resources:
+  {}
+cache:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClass: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/main.py b/main.py
index a9a32be..be336c7 100644
--- a/main.py
+++ b/main.py
@@ -321,6 +321,10 @@ async def chat_completions(
     default_user_end = ""
     default_system = ""
+    if "llama" in body.model:
+        default_assistant_start = "ASSISTANT:"
+        default_user_start = "USER: "
+        default_system="SYSTEM: You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
 
     # For most instruct fine-tuned models using Alpaca prompt template
     # Although instruct fine-tuned models are not tuned for chat, they can be to generate response as if chatting, using Alpaca
     # prompt template likely gives better results than using the default prompt template
diff --git a/requirements.txt b/requirements.txt
index a92ba75..04f4ce6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.5.7
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.10
+ctransformers==0.2.13
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
diff --git a/values.yaml b/values.yaml
deleted file mode 100644
index d5b9beb..0000000
--- a/values.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-replicas: 1
-deployment:
-  image: quay.io/chenhunghan/ialacol:latest
-  env:
-    DEFAULT_MODEL_HG_REPO_ID: TheBloke/orca_mini_3B-GGML
-    DEFAULT_MODEL_FILE: orca-mini-3b.ggmlv3.q4_0.bin
-    DEFAULT_MODEL_META: ""
-service:
-  type: ClusterIP
-  port: 8000
-  annotations: {}
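
The README sections added above install the chart; the README's surrounding context already drives the server through its OpenAI-compatible API with the `openai` Python client (`print(chat_completion.choices[0].message.content)`). As a minimal sketch of exercising a freshly installed Llama 2 release, assuming the `llama2-7b-chat` service has been port-forwarded to localhost:8000 (the exact Kubernetes service name depends on the chart's naming template, which is not part of this patch) and that the model name mirrors `DEFAULT_MODEL_FILE` from the 7B example values:

```python
import openai

# Assumed setup, not shown in the patch:
#   kubectl port-forward svc/llama2-7b-chat 8000:8000
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "placeholder"  # assumed unused by the self-hosted endpoint

chat_completion = openai.ChatCompletion.create(
    model="llama-2-7b-chat.ggmlv3.q4_0.bin",  # mirrors DEFAULT_MODEL_FILE above
    messages=[{"role": "user", "content": "Explain Kubernetes in one sentence."}],
)
print(chat_completion.choices[0].message.content)
```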
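The `main.py` hunk only sets Llama 2 defaults for the `SYSTEM: `/`USER: `/`ASSISTANT:` markers; the code that joins chat messages into the final prompt lives elsewhere in `main.py` and is not part of this diff. Purely as an illustration of what such a template produces, here is a sketch in which the function name and joining order are assumptions, not the project's actual logic:

```python
def build_chat_prompt(messages, system, user_start="USER: ", assistant_start="ASSISTANT:"):
    """Illustrative only: join chat messages using SYSTEM/USER/ASSISTANT
    markers like the defaults set in the main.py hunk above."""
    parts = [system] if system else []
    for message in messages:
        if message["role"] == "user":
            parts.append(f"{user_start}{message['content']}")
        elif message["role"] == "assistant":
            parts.append(f"{assistant_start} {message['content']}")
    # End on the assistant marker so the model continues with its reply.
    parts.append(assistant_start)
    return "\n".join(parts)


print(build_chat_prompt(
    [{"role": "user", "content": "Hi, who are you?"}],
    system="SYSTEM: You are a helpful, respectful and honest assistant.",
))
```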
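The `requirements.txt` bump to `ctransformers==0.2.13` is what enables loading the GGML Llama 2 weights referenced in the new example values files. As a rough sketch of that backend call (the actual model-loading code in `main.py` is not shown in this patch; the repo id and file name simply mirror the 7B values file, and the prompt and generation parameters are arbitrary):

```python
from ctransformers import AutoModelForCausalLM

# Download and load the quantized 7B chat model named in
# examples/values/llama2-7b-chat.yaml, then run one completion.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-GGML",
    model_file="llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
)
print(llm("USER: What is a Helm chart?\nASSISTANT:", max_new_tokens=128))
```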