diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..78264c8
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+FROM node:20.16.0 as web_compile
+WORKDIR /home
+RUN <
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ ... @@
 Installation
-You can install using Pypi:
-```
-pip install ktransformers --no-build-isolation
-```
+1. Use a Docker image; see the [Docker documentation](./doc/en/Docker.md)
+2. You can install using PyPI:
-Or download source code and compile:
- - init source code
- ```sh
- git clone https://github.com/kvcache-ai/ktransformers.git
- cd ktransformers
- git submodule init
- git submodule update
- ```
- - [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
- - Compile and install
```
- bash install.sh
+ pip install ktransformers --no-build-isolation
```
+3. Or you can download the source code and compile:
+ - init source code
+ ```sh
+ git clone https://github.com/kvcache-ai/ktransformers.git
+ cd ktransformers
+ git submodule init
+ git submodule update
+ ```
+ - [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing `bash install.sh`
+ - Compile and install
+ ```
+ bash install.sh
+ ```
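+ - [Optional] To pin the CPU instruction set used for the CPU kernels instead of relying on `-march=native`, `setup.py` reads a `CPU_INSTRUCT` environment variable (`NATIVE` by default; `FANCY`, `AVX512`, and `AVX2` are also recognized). A minimal sketch, assuming `install.sh` forwards environment variables to the underlying `pip install`:
+ ```sh
+ # force the AVX2 code path, e.g. on a CPU without AVX-512
+ CPU_INSTRUCT=AVX2 bash install.sh
+ ```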
+
Local Chat
We provide a simple command-line local chat Python script that you can run for testing.
- > Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we alse support other models, you can replace it with any other model that you want to test.
+ > Note that this is a very simple test tool that only supports one round of chat, with no memory of previous input; if you want to try the full ability of the model, go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test.
Run Example
diff --git a/doc/en/Docker.md b/doc/en/Docker.md
new file mode 100644
index 0000000..0fe9616
--- /dev/null
+++ b/doc/en/Docker.md
@@ -0,0 +1,27 @@
+# Docker
+
+## Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store large models and intermediate files (e.g. /mnt/models)
+
+## Images
+Prebuilt Docker images for this project are **currently being uploaded** and will be linked here once they are available.
+
+## Building the Docker image locally
+ - Download the [Dockerfile](../../Dockerfile)
+
+ - In the directory containing the Dockerfile, run
+ ```bash
+ docker build -t approachingai/ktransformers:v0.1.1 .
+ ```
+
+## Usage
+
+Assuming you have installed the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) so that you can use the GPU in a Docker container, run:
+```bash
+docker run --gpus all -v /path/to/models:/models -p 10002:10002 approachingai/ktransformers:v0.1.1 --port 10002 --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --web True
+```
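+
+To quickly confirm the GPU is visible inside the container, a sanity-check sketch (this assumes the image defines the server as its entrypoint, so `--entrypoint` bypasses it; `nvidia-smi` is typically made available by the NVIDIA container runtime):
+```bash
+docker run --rm --gpus all --entrypoint nvidia-smi approachingai/ktransformers:v0.1.1
+```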
+
+You can find more operators in the [README](../../README.md).
\ No newline at end of file
diff --git a/doc/en/deepseek-v2-injection.md b/doc/en/deepseek-v2-injection.md
index 43359cf..c1ccd39 100644
--- a/doc/en/deepseek-v2-injection.md
+++ b/doc/en/deepseek-v2-injection.md
@@ -43,7 +43,11 @@ In the current version of KTransformers, we utilize Marlin for GPU kernels and l
-
+
+
+
### Arithmetic Intensity Guided Offloading
diff --git a/ktransformers/__init__.py b/ktransformers/__init__.py
index a68927d..d1f2e39 100644
--- a/ktransformers/__init__.py
+++ b/ktransformers/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.0"
\ No newline at end of file
+__version__ = "0.1.1"
\ No newline at end of file
diff --git a/ktransformers/ktransformers_ext/CMakeLists.txt b/ktransformers/ktransformers_ext/CMakeLists.txt
index 3b6f54e..02e6a04 100644
--- a/ktransformers/ktransformers_ext/CMakeLists.txt
+++ b/ktransformers/ktransformers_ext/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.16)
project(cpuinfer_ext VERSION 0.1.0)
set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
set(CMAKE_BUILD_TYPE "Release")
include(CheckCXXCompilerFlag)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -10,6 +10,27 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
+# instruction set specific
+if (LLAMA_NATIVE)
+ set(INS_ENB OFF)
+else()
+ set(INS_ENB ON)
+endif()
+
+option(LLAMA_AVX "llama: enable AVX" OFF)
+option(LLAMA_AVX2 "llama: enable AVX2" OFF)
+option(LLAMA_AVX512 "llama: enable AVX512" OFF)
+option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
+option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
+option(LLAMA_FMA "llama: enable FMA" OFF)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+ option(LLAMA_F16C "llama: enable F16C" OFF)
+endif()
+option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
+option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
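+
+# Example (hypothetical command line, mirroring the CMAKE_FANCY preset used by setup.py):
+#   cmake -B build -DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON \
+#         -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON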
+
+
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
@@ -102,6 +123,20 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
+ if (LLAMA_AVX512_FANCY_SIMD)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VL__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VL__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BW__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BW__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512DQ__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512DQ__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+ endif()
+ if (LLAMA_AVX512_BF16)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+ add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+ endif()
elseif (LLAMA_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX)
@@ -133,6 +168,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (LLAMA_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni)
endif()
+ if (LLAMA_AVX512_FANCY_SIMD)
+ list(APPEND ARCH_FLAGS -mavx512vl)
+ list(APPEND ARCH_FLAGS -mavx512bw)
+ list(APPEND ARCH_FLAGS -mavx512dq)
+ list(APPEND ARCH_FLAGS -mavx512vnni)
+ endif()
+ if (LLAMA_AVX512_BF16)
+ list(APPEND ARCH_FLAGS -mavx512bf16)
+ endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
diff --git a/ktransformers/ktransformers_ext/cmake/FindSIMD.cmake b/ktransformers/ktransformers_ext/cmake/FindSIMD.cmake
new file mode 100644
index 0000000..33377ec
--- /dev/null
+++ b/ktransformers/ktransformers_ext/cmake/FindSIMD.cmake
@@ -0,0 +1,100 @@
+include(CheckCSourceRuns)
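+
+# NOTE: the include point for this module is not part of this diff; presumably it is
+# pulled in from the top-level CMakeLists.txt (e.g. on MSVC, where -march=native is
+# unavailable) via something like:
+#   include(cmake/FindSIMD.cmake)
+# It probes /arch:AVX, /arch:AVX2 and /arch:AVX512 and sets LLAMA_AVX* accordingly.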
+
+set(AVX_CODE "
+ #include <immintrin.h>
+ int main()
+ {
+ __m256 a;
+ a = _mm256_set1_ps(0);
+ return 0;
+ }
+")
+
+set(AVX512_CODE "
+ #include <immintrin.h>
+ int main()
+ {
+ __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+ __m512i b = a;
+ __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
+ return 0;
+ }
+")
+
+set(AVX2_CODE "
+ #include <immintrin.h>
+ int main()
+ {
+ __m256i a = {0};
+ a = _mm256_abs_epi16(a);
+ __m256i x;
+ _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
+ return 0;
+ }
+")
+
+set(FMA_CODE "
+ #include <immintrin.h>
+ int main()
+ {
+ __m256 acc = _mm256_setzero_ps();
+ const __m256 d = _mm256_setzero_ps();
+ const __m256 p = _mm256_setzero_ps();
+ acc = _mm256_fmadd_ps( d, p, acc );
+ return 0;
+ }
+")
+
+macro(check_sse type flags)
+ set(__FLAG_I 1)
+ set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+ foreach (__FLAG ${flags})
+ if (NOT ${type}_FOUND)
+ set(CMAKE_REQUIRED_FLAGS ${__FLAG})
+ check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
+ if (HAS_${type}_${__FLAG_I})
+ set(${type}_FOUND TRUE CACHE BOOL "${type} support")
+ set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
+ endif()
+ math(EXPR __FLAG_I "${__FLAG_I}+1")
+ endif()
+ endforeach()
+ set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+ if (NOT ${type}_FOUND)
+ set(${type}_FOUND FALSE CACHE BOOL "${type} support")
+ set(${type}_FLAGS "" CACHE STRING "${type} flags")
+ endif()
+
+ mark_as_advanced(${type}_FOUND ${type}_FLAGS)
+endmacro()
+
+# flags are for MSVC only!
+check_sse("AVX" " ;/arch:AVX")
+if (NOT ${AVX_FOUND})
+ set(LLAMA_AVX OFF)
+else()
+ set(LLAMA_AVX ON)
+endif()
+
+check_sse("AVX2" " ;/arch:AVX2")
+check_sse("FMA" " ;/arch:AVX2")
+if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
+ set(LLAMA_AVX2 OFF)
+else()
+ set(LLAMA_AVX2 ON)
+endif()
+
+check_sse("AVX512" " ;/arch:AVX512")
+if (NOT ${AVX512_FOUND})
+ set(LLAMA_AVX512 OFF)
+else()
+ set(LLAMA_AVX512 ON)
+endif()
diff --git a/pyproject.toml b/pyproject.toml
index 0bbef99..8cfe290 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
[build-system]
requires = [
"setuptools",
- "torch == 2.3.1",
+ "torch >= 2.3.0",
"ninja",
"packaging"
]
@@ -29,7 +29,7 @@ dependencies = [
"fire"
]
-requires-python = ">=3.11"
+requires-python = ">=3.10"
authors = [
{name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
@@ -50,6 +50,7 @@ keywords = ["ktransformers", "llm"]
classifiers = [
"Development Status :: 4 - Beta",
+ "Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12"
]
diff --git a/setup.py b/setup.py
index 38ee098..1b2d3cf 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
Date : 2024-07-27 16:15:27
Version : 1.0.0
LastEditors : chenxl
-LastEditTime : 2024-07-29 09:40:24
+LastEditTime : 2024-07-31 09:44:46
Adapted from:
https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
Copyright (c) 2023, Tri Dao.
@@ -19,6 +19,7 @@
import ast
import subprocess
import platform
+import http.client
import urllib.request
import urllib.error
from pathlib import Path
@@ -28,7 +29,16 @@
from setuptools import setup, Extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
-
+class CpuInstructInfo:
+ CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
+ FANCY = "FANCY"
+ AVX512 = "AVX512"
+ AVX2 = "AVX2"
+ CMAKE_NATIVE = "-DLLAMA_NATIVE=ON"
+ CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON"
+ CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
+ CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON"
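+ # CPU_INSTRUCT is read from the environment when setup.py runs; FANCY/AVX512/AVX2
+ # select the matching CMake flag string above (consumed in build_extension()), and
+ # any other value falls back to the native (-march=native) build.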
+
class VersionInfo:
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
PACKAGE_NAME = "ktransformers"
@@ -61,12 +71,24 @@ def get_platform(self,):
raise ValueError("Unsupported platform: {}".format(sys.platform))
def get_cpu_instruct(self,):
+ if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
+ return "fancy"
+ elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
+ return "avx512"
+ elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
+ return "avx2"
+ else:
+ print("Using native cpu instruct")
if sys.platform.startswith("linux"):
with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
cpuinfo = cpu_f.read()
flags_line = [line for line in cpuinfo.split(
'\n') if line.startswith('flags')][0]
flags = flags_line.split(':')[1].strip().split(' ')
+ # fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI
+ for flag in flags:
+ if 'avx512bw' in flag:
+ return 'fancy'
for flag in flags:
if 'avx512' in flag:
return 'avx512'
@@ -116,6 +138,7 @@ def get_wheel_name(self,):
def run(self):
if VersionInfo.FORCE_BUILD:
super().run()
+ return
wheel_filename, wheel_url = self.get_wheel_name()
print("Guessing wheel URL: ", wheel_url)
try:
@@ -132,7 +155,7 @@ def run(self):
wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
print("Raw wheel path", wheel_path)
os.rename(wheel_filename, wheel_path)
- except (urllib.error.HTTPError, urllib.error.URLError):
+ except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
print("Precompiled wheel not found. Building from source...")
# If the wheel could not be downloaded, build from source
super().run()
@@ -186,7 +209,19 @@ def build_extension(self, ext) -> None:
if "CMAKE_ARGS" in os.environ:
cmake_args += [
item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
-
+
+ if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
+ cpu_args = CpuInstructInfo.CMAKE_FANCY
+ elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
+ cpu_args = CpuInstructInfo.CMAKE_AVX512
+ elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
+ cpu_args = CpuInstructInfo.CMAKE_AVX2
+ else:
+ cpu_args = CpuInstructInfo.CMAKE_NATIVE
+
+ cmake_args += [
+ item for item in cpu_args.split(" ") if item
+ ]
# In this example, we pass in the version to C++. You might not need to.
cmake_args += [
f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]