Merge pull request #17 from UnicornChan/feature-support-multi-instruct
Feature support multi instruct
UnicornChan authored Aug 1, 2024
2 parents 2562082 + 86ba133 commit 5e83bc0
Showing 9 changed files with 272 additions and 25 deletions.
34 changes: 34 additions & 0 deletions Dockerfile
@@ -0,0 +1,34 @@
FROM node:20.16.0 as web_compile
WORKDIR /home
RUN <<EOF
git clone https://github.com/kvcache-ai/ktransformers.git &&
cd ktransformers/ktransformers/website/ &&
npm install @vue/cli &&
npm run build &&
rm -rf node_modules
EOF



FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-devel as compile_server
WORKDIR /workspace
COPY --from=web_compile /home/ktransformers /workspace/ktransformers
RUN <<EOF
apt update -y && apt install -y --no-install-recommends \
git \
wget \
vim \
gcc \
g++ \
cmake &&
rm -rf /var/lib/apt/lists/* &&
cd ktransformers &&
git submodule init &&
git submodule update &&
pip install ninja pyproject numpy &&
pip install flash-attn &&
CPU_INSTRUCT=NATIVE KTRANSFORMERS_FORCE_BUILD=TRUE TORCH_CUDA_ARCH_LIST="8.0;8.6;8.7;8.9" pip install . --no-build-isolation --verbose &&
pip cache purge
EOF

ENTRYPOINT [ "/opt/conda/bin/ktransformers" ]
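
One note on this Dockerfile: the `RUN <<EOF` heredoc blocks are only understood by BuildKit (Dockerfile syntax 1.4 and later), so on Docker versions where BuildKit is not the default builder it must be enabled explicitly. A minimal build command, with the tag taken from doc/en/Docker.md below:

```bash
# Heredoc RUN blocks require BuildKit (Dockerfile syntax >= 1.4);
# DOCKER_BUILDKIT=1 is only needed on older Docker versions.
DOCKER_BUILDKIT=1 docker build -t approachingai/ktransformers:v0.1.1 .
```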
34 changes: 18 additions & 16 deletions README.md
@@ -80,30 +80,32 @@ Some preparation:
```

<h3>Installation</h3>
-You can install using Pypi:
-
-```
-pip install ktransformers --no-build-isolation
-```
-
-Or download source code and compile:
-- init source code
-```sh
-git clone https://github.com/kvcache-ai/ktransformers.git
-cd ktransformers
-git submodule init
-git submodule update
-```
-- [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
-- Compile and install
-```
-bash install.sh
-```
+1. Use a Docker image, see [documentation for Docker](./doc/en/docker.md)
+2. You can install using Pypi:
+
+```
+pip install ktransformers --no-build-isolation
+```
+
+3. Or you can download source code and compile:
+- init source code
+```sh
+git clone https://github.com/kvcache-ai/ktransformers.git
+cd ktransformers
+git submodule init
+git submodule update
+```
+- [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
+- Compile and install
+```
+bash install.sh
+```
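
Since this PR wires the new instruction-set selection through the build, here is a hedged sketch of how a source install might pin a SIMD level instead of relying on `-march=native`. Only `CPU_INSTRUCT=NATIVE` actually appears in this diff (in the Dockerfile); any other value, such as `AVX2` below, is an assumption mirroring the new `LLAMA_*` CMake options:

```sh
# Hypothetical: build the C++ extension for a fixed SIMD level rather than
# -march=native. Only CPU_INSTRUCT=NATIVE is shown in this PR; AVX2 is an
# assumed value based on the new LLAMA_AVX2 CMake option.
CPU_INSTRUCT=AVX2 KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
```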

<h3>Local Chat</h3>
We provide a simple command-line local chat Python script that you can run for testing.

-> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we alse support other models, you can replace it with any other model that you want to test.
+> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we also support other models, you can replace it with any other model that you want to test.


<h4>Run Example</h4>
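The actual Run Example content is collapsed in this view. As a hedged sketch of what such an invocation presumably looks like — the module path and flags below are assumptions based on the project's conventions, not part of this diff:

```sh
# Hypothetical invocation of the local chat test script; the exact script
# name and flags are in the collapsed "Run Example" section of the README.
python -m ktransformers.local_chat \
    --model_path deepseek-ai/DeepSeek-V2-Lite-Chat \
    --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF
```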
27 changes: 27 additions & 0 deletions doc/en/Docker.md
@@ -0,0 +1,27 @@
# Docker

## Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (e.g., /mnt/models)

## Images
There are Docker images available for our project:

**Uploading**

## Building docker image locally
- Download the Dockerfile from [here](../../Dockerfile)

- Then, in the directory containing the Dockerfile, execute:
```bash
docker build -t approachingai/ktransformers:v0.1.1 .
```

## Usage

Assuming you have installed the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) so that you can use the GPU in a Docker container.
```
docker run --gpus all -v /path/to/models:/models -p 10002:10002 approachingai/ktransformers:v0.1.1 --port 10002 --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --web True
```

More options can be found in the [readme](../../README.md).
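
Once the container is up with `--web True`, a quick smoke test might look like the following. The route is an assumption — the server's exact API paths are not part of this diff:

```bash
# Hypothetical smoke test against the published port. The OpenAI-style
# route is assumed; check the server docs for the real endpoint.
curl http://localhost:10002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "hello"}]}'
```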
6 changes: 5 additions & 1 deletion doc/en/deepseek-v2-injection.md
@@ -43,7 +43,11 @@ In the current version of KTransformers, we utilize Marlin for GPU kernels and l
<img alt="CPUInfer Performance" src="../assets/cpuinfer.png" width=80%>
</picture>
</p>

<p align="center">
<picture>
<img alt="marlin performance" src="https://github.com/IST-DASLab/marlin/blob/master/assets/sustained.png?raw=true" width=80%>
</picture>
</p>

### Arithmetic Intensity Guided Offloading

2 changes: 1 addition & 1 deletion ktransformers/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"
46 changes: 45 additions & 1 deletion ktransformers/ktransformers_ext/CMakeLists.txt
@@ -2,14 +2,35 @@ cmake_minimum_required(VERSION 3.16)
project(cpuinfer_ext VERSION 0.1.0)

set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
set(CMAKE_BUILD_TYPE "Release")
include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)


option(LLAMA_NATIVE "llama: enable -march=native flag" ON)

# instruction set specific
if (LLAMA_NATIVE)
set(INS_ENB OFF)
else()
set(INS_ENB ON)
endif()

option(LLAMA_AVX "llama: enable AVX" OFF)
option(LLAMA_AVX2 "llama: enable AVX2" OFF)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA "llama: enable FMA" OFF)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)
option(LLAMA_F16C "llama: enable F16C" OFF)
endif()
option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)


# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
@@ -102,6 +123,20 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (LLAMA_AVX512_FANCY_SIMD)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VL__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VL__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BW__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BW__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512DQ__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512DQ__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
if (LLAMA_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
elseif (LLAMA_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX)
@@ -133,6 +168,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (LLAMA_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (LLAMA_AVX512_FANCY_SIMD)
list(APPEND ARCH_FLAGS -mavx512vl)
list(APPEND ARCH_FLAGS -mavx512bw)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
if (LLAMA_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
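These new options let a build pin specific instruction sets instead of relying on `-march=native`. A hedged sketch of configuring the extension by hand — the documented path goes through `install.sh`/pip (which also supplies the Python and Torch paths), so this direct invocation is illustrative only:

```bash
# Illustrative only: configure cpuinfer_ext with explicit SIMD options
# instead of LLAMA_NATIVE's -march=native, then build in Release mode.
cmake -B build -DLLAMA_NATIVE=OFF -DLLAMA_AVX2=ON -DLLAMA_FMA=ON -DLLAMA_F16C=ON
cmake --build build --config Release
```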
100 changes: 100 additions & 0 deletions ktransformers/ktransformers_ext/cmake/FindSIMD.cmake
@@ -0,0 +1,100 @@
include(CheckCSourceRuns)

set(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")

set(AVX512_CODE "
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
__m512i b = a;
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
return 0;
}
")

set(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
__m256i x;
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
return 0;
}
")

set(FMA_CODE "
#include <immintrin.h>
int main()
{
__m256 acc = _mm256_setzero_ps();
const __m256 d = _mm256_setzero_ps();
const __m256 p = _mm256_setzero_ps();
acc = _mm256_fmadd_ps( d, p, acc );
return 0;
}
")

macro(check_sse type flags)
set(__FLAG_I 1)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
foreach (__FLAG ${flags})
if (NOT ${type}_FOUND)
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
if (HAS_${type}_${__FLAG_I})
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
endif()
math(EXPR __FLAG_I "${__FLAG_I}+1")
endif()
endforeach()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})

if (NOT ${type}_FOUND)
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
set(${type}_FLAGS "" CACHE STRING "${type} flags")
endif()

mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()

# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
set(LLAMA_AVX OFF)
else()
set(LLAMA_AVX ON)
endif()

check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(LLAMA_AVX2 OFF)
else()
set(LLAMA_AVX2 ON)
endif()

check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
set(LLAMA_AVX512 OFF)
else()
set(LLAMA_AVX512 ON)
endif()
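
The module's own comment notes that the tested flags are MSVC-only, so a consumer would typically include it only in that configuration. A minimal sketch of such a guard — the exact wiring in this PR's CMakeLists is inside the truncated hunks above, so this is an assumption:

```cmake
# Hypothetical consumer: on MSVC there is no -march=native, so probe the
# host with FindSIMD.cmake and let the LLAMA_* results it sets drive the
# /arch flags chosen later in the file.
if (LLAMA_NATIVE AND MSVC)
    include(cmake/FindSIMD.cmake)
endif()
```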
5 changes: 3 additions & 2 deletions pyproject.toml
@@ -1,7 +1,7 @@
[build-system]
requires = [
"setuptools",
"torch == 2.3.1",
"torch >= 2.3.0",
"ninja",
"packaging"
]
@@ -29,7 +29,7 @@ dependencies = [
"fire"
]

-requires-python = ">=3.11"
+requires-python = ">=3.10"

authors = [
{name = "KVCache.AI", email = "[email protected]"}
@@ -50,6 +50,7 @@ keywords = ["ktransformers", "llm"]

classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12"
]
