diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..78264c8
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+FROM node:20.16.0 as web_compile
+WORKDIR /home
+RUN <

Installation
-You can install using Pypi:
-```
-pip install ktransformers --no-build-isolation
-```
+1. Use a Docker image, see the [documentation for Docker](./doc/en/Docker.md)
+2. You can install using PyPI:
-Or download source code and compile:
-  - init source code
-  ```sh
-  git clone https://github.com/kvcache-ai/ktransformers.git
-  cd ktransformers
-  git submodule init
-  git submodule update
-  ```
-  - [Optional] If you want to run with website, please [compile the website](./doc/en/api/server/website.md) before execute ```bash install.sh```
-  - Compile and install
 ```
-  bash install.sh
+  pip install ktransformers --no-build-isolation
 ```
+3. Or you can download the source code and compile it:
+  - Init the source code
+  ```sh
+  git clone https://github.com/kvcache-ai/ktransformers.git
+  cd ktransformers
+  git submodule init
+  git submodule update
+  ```
+  - [Optional] If you want to run with the website, please [compile the website](./doc/en/api/server/website.md) before executing ```bash install.sh```
+  - Compile and install
+  ```
+  bash install.sh
+  ```
+

Local Chat

We provide a simple command-line local chat Python script that you can run for testing.
-> Note that this is a very simple test tool only support one round chat without any memory about last input, if you want to try full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here. But we alse support other models, you can replace it with any other model that you want to test.
+> Note that this is a very simple test tool that only supports one round of chat, without any memory of the previous input; if you want to try the full ability of the model, you may go to [RESTful API and Web UI](#id_666). We use the DeepSeek-V2-Lite-Chat-GGUF model as an example here, but we also support other models; you can replace it with any other model that you want to test.

Run Example

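For reference, a run of the local chat script described above might look like the sketch below. The module path `ktransformers.local_chat` and both placeholder paths are assumptions; the `--model_path` and `--gguf_path` flag names mirror the ones used in the Docker usage example further down.

```bash
# Hypothetical one-round chat run with the DeepSeek-V2-Lite-Chat example;
# replace both paths with your own model configuration and GGUF locations.
python -m ktransformers.local_chat \
    --model_path deepseek-ai/DeepSeek-V2-Lite-Chat \
    --gguf_path /path/to/DeepSeek-V2-Lite-Chat-GGUF
```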
diff --git a/doc/en/Docker.md b/doc/en/Docker.md
new file mode 100644
index 0000000..0fe9616
--- /dev/null
+++ b/doc/en/Docker.md
@@ -0,0 +1,27 @@
+# Docker
+
+## Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store big models & intermediate files (e.g. /mnt/models).
+
+## Images
+There are Docker images available for our project:
+
+**Uploading**
+
+## Building the Docker image locally
+ - Download the Dockerfile from [here](../../Dockerfile)
+
+ - Then, in the same directory, execute
+   ```bash
+   docker build -t approachingai/ktransformers:v0.1.1 .
+   ```
+
+## Usage
+
+Assuming you have the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) installed, you can use the GPU in a Docker container.
+```
+docker run --gpus all -v /path/to/models:/models -p 10002:10002 approachingai/ktransformers:v0.1.1 --port 10002 --gguf_path /models/path/to/gguf_path --model_path /models/path/to/model_path --web True
+```
+
+More options can be found in the [readme](../../README.md)
\ No newline at end of file
diff --git a/doc/en/deepseek-v2-injection.md b/doc/en/deepseek-v2-injection.md
index 43359cf..c1ccd39 100644
--- a/doc/en/deepseek-v2-injection.md
+++ b/doc/en/deepseek-v2-injection.md
@@ -43,7 +43,11 @@ In the current version of KTransformers, we utilize Marlin for GPU kernels and l
 CPUInfer Performance

-
+
+ [figure: marlin performance]
+

### Arithmetic Intensity Guided Offloading

diff --git a/ktransformers/__init__.py b/ktransformers/__init__.py
index a68927d..d1f2e39 100644
--- a/ktransformers/__init__.py
+++ b/ktransformers/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.0"
\ No newline at end of file
+__version__ = "0.1.1"
\ No newline at end of file
diff --git a/ktransformers/ktransformers_ext/CMakeLists.txt b/ktransformers/ktransformers_ext/CMakeLists.txt
index 3b6f54e..02e6a04 100644
--- a/ktransformers/ktransformers_ext/CMakeLists.txt
+++ b/ktransformers/ktransformers_ext/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.16)
 project(cpuinfer_ext VERSION 0.1.0)
 set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -ffast-math")
 set(CMAKE_BUILD_TYPE "Release")
 include(CheckCXXCompilerFlag)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -10,6 +10,27 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
 
+# instruction set specific
+if (LLAMA_NATIVE)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
+option(LLAMA_AVX              "llama: enable AVX"              OFF)
+option(LLAMA_AVX2             "llama: enable AVX2"             OFF)
+option(LLAMA_AVX512           "llama: enable AVX512"           OFF)
+option(LLAMA_AVX512_VBMI      "llama: enable AVX512-VBMI"      OFF)
+option(LLAMA_AVX512_VNNI      "llama: enable AVX512-VNNI"      OFF)
+option(LLAMA_FMA              "llama: enable FMA"              OFF)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(LLAMA_F16C         "llama: enable F16C"             OFF)
+endif()
+option(LLAMA_AVX512_FANCY_SIMD "llama: enable AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI" OFF)
+option(LLAMA_AVX512_BF16      "llama: enable AVX512-BF16"      OFF)
+
+
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue
@@ -102,6 +123,20 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
         add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
     endif()
+    if (LLAMA_AVX512_FANCY_SIMD)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VL__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VL__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BW__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BW__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512DQ__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512DQ__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+    endif()
+    if (LLAMA_AVX512_BF16)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+        add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+    endif()
 elseif (LLAMA_AVX2)
     list(APPEND ARCH_FLAGS /arch:AVX2)
 elseif (LLAMA_AVX)
@@ -133,6 +168,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
         if (LLAMA_AVX512_VNNI)
             list(APPEND ARCH_FLAGS -mavx512vnni)
         endif()
+        if (LLAMA_AVX512_FANCY_SIMD)
+            list(APPEND ARCH_FLAGS -mavx512vl)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (LLAMA_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
diff --git a/ktransformers/ktransformers_ext/cmake/FindSIMD.cmake b/ktransformers/ktransformers_ext/cmake/FindSIMD.cmake
new file mode 100644
index 0000000..33377ec
--- /dev/null
+++ b/ktransformers/ktransformers_ext/cmake/FindSIMD.cmake
@@ -0,0 +1,100 @@
+include(CheckCSourceRuns)
+
+set(AVX_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a;
+        a = _mm256_set1_ps(0);
+        return 0;
+    }
+")
+
+set(AVX512_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0);
+        __m512i b = a;
+        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
+        return 0;
+    }
+")
+
+set(AVX2_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = {0};
+        a = _mm256_abs_epi16(a);
+        __m256i x;
+        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
+        return 0;
+    }
+")
+
+set(FMA_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 acc = _mm256_setzero_ps();
+        const __m256 d = _mm256_setzero_ps();
+        const __m256 p = _mm256_setzero_ps();
+        acc = _mm256_fmadd_ps( d, p, acc );
+        return 0;
+    }
+")
+
+macro(check_sse type flags)
+    set(__FLAG_I 1)
+    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+    foreach (__FLAG ${flags})
+        if (NOT ${type}_FOUND)
+            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
+            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
+            if (HAS_${type}_${__FLAG_I})
+                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
+                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
+            endif()
+            math(EXPR __FLAG_I "${__FLAG_I}+1")
+        endif()
+    endforeach()
+    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+    if (NOT ${type}_FOUND)
+        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
+        set(${type}_FLAGS "" CACHE STRING "${type} flags")
+    endif()
+
+    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
+endmacro()
+
+# flags are for MSVC only!
+check_sse("AVX" " ;/arch:AVX")
+if (NOT ${AVX_FOUND})
+    set(LLAMA_AVX OFF)
+else()
+    set(LLAMA_AVX ON)
+endif()
+
+check_sse("AVX2" " ;/arch:AVX2")
+check_sse("FMA" " ;/arch:AVX2")
+if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
+    set(LLAMA_AVX2 OFF)
+else()
+    set(LLAMA_AVX2 ON)
+endif()
+
+check_sse("AVX512" " ;/arch:AVX512")
+if (NOT ${AVX512_FOUND})
+    set(LLAMA_AVX512 OFF)
+else()
+    set(LLAMA_AVX512 ON)
+endif()
diff --git a/pyproject.toml b/pyproject.toml
index 0bbef99..8cfe290 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [build-system]
 requires = [
     "setuptools",
-    "torch == 2.3.1",
+    "torch >= 2.3.0",
     "ninja",
     "packaging"
 ]
@@ -29,7 +29,7 @@ dependencies = [
     "fire"
 ]
 
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 
 authors = [
   {name = "KVCache.AI", email = "zhang.mingxing@outlook.com"}
@@ -50,6 +50,7 @@ keywords = ["ktransformers", "llm"]
 
 classifiers = [
     "Development Status :: 4 - Beta",
+    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12"
 ]
diff --git a/setup.py b/setup.py
index 38ee098..1b2d3cf 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 Date : 2024-07-27 16:15:27
 Version : 1.0.0
 LastEditors : chenxl
-LastEditTime : 2024-07-29 09:40:24
+LastEditTime : 2024-07-31 09:44:46
 Adapted from:
 https://github.com/Dao-AILab/flash-attention/blob/v2.6.3/setup.py
 Copyright (c) 2023, Tri Dao.
@@ -19,6 +19,7 @@
 import ast
 import subprocess
 import platform
+import http.client
 import urllib.request
 import urllib.error
 from pathlib import Path
@@ -28,7 +29,16 @@
 from setuptools import setup, Extension
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
 
-
+class CpuInstructInfo:
+    CPU_INSTRUCT = os.getenv("CPU_INSTRUCT", "NATIVE")
+    FANCY = "FANCY"
+    AVX512 = "AVX512"
+    AVX2 = "AVX2"
+    CMAKE_NATIVE = "-DLLAMA_NATIVE=ON"
+    CMAKE_FANCY = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON -DLLAMA_AVX512_FANCY_SIMD=ON"
+    CMAKE_AVX512 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON"
+    CMAKE_AVX2 = "-DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON -DLLAMA_AVX=ON -DLLAMA_AVX2=ON"
+
 class VersionInfo:
     THIS_DIR = os.path.dirname(os.path.abspath(__file__))
     PACKAGE_NAME = "ktransformers"
@@ -61,12 +71,24 @@ def get_platform(self,):
             raise ValueError("Unsupported platform: {}".format(sys.platform))
 
     def get_cpu_instruct(self,):
+        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
+            return "fancy"
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
+            return "avx512"
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
+            return "avx2"
+        else:
+            print("Using native cpu instruct")
         if sys.platform.startswith("linux"):
             with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
                 cpuinfo = cpu_f.read()
             flags_line = [line for line in cpuinfo.split(
                 '\n') if line.startswith('flags')][0]
             flags = flags_line.split(':')[1].strip().split(' ')
+            # fancy with AVX512-VL, AVX512-BW, AVX512-DQ, AVX512-VNNI
+            for flag in flags:
+                if 'avx512bw' in flag:
+                    return 'fancy'
             for flag in flags:
                 if 'avx512' in flag:
                     return 'avx512'
@@ -116,6 +138,7 @@ def get_wheel_name(self,):
     def run(self):
         if VersionInfo.FORCE_BUILD:
             super().run()
+            return
         wheel_filename, wheel_url = self.get_wheel_name()
         print("Guessing wheel URL: ", wheel_url)
         try:
@@ -132,7 +155,7 @@ def run(self):
             wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
             print("Raw wheel path", wheel_path)
             os.rename(wheel_filename, wheel_path)
-        except (urllib.error.HTTPError, urllib.error.URLError):
+        except (urllib.error.HTTPError, urllib.error.URLError, http.client.RemoteDisconnected):
             print("Precompiled wheel not found. Building from source...")
             # If the wheel could not be downloaded, build from source
             super().run()
@@ -186,7 +209,19 @@ def build_extension(self, ext) -> None:
         if "CMAKE_ARGS" in os.environ:
             cmake_args += [
                 item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
-
+
+        if CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.FANCY:
+            cpu_args = CpuInstructInfo.CMAKE_FANCY
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX512:
+            cpu_args = CpuInstructInfo.CMAKE_AVX512
+        elif CpuInstructInfo.CPU_INSTRUCT == CpuInstructInfo.AVX2:
+            cpu_args = CpuInstructInfo.CMAKE_AVX2
+        else:
+            cpu_args = CpuInstructInfo.CMAKE_NATIVE
+
+        cmake_args += [
+            item for item in cpu_args.split(" ") if item
+        ]
         # In this example, we pass in the version to C++. You might not need to.
         cmake_args += [
             f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]
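To tie the `setup.py` and CMake changes together, the sketch below shows how the new `CPU_INSTRUCT` switch might be exercised when building from source. The install command and the manual CMake configure line are assumptions; only the recognized values (`NATIVE`, `FANCY`, `AVX512`, `AVX2`) and the `-DLLAMA_*` flags come from `CpuInstructInfo` above.

```bash
# Hypothetical source build that forces the plain AVX512 path instead of
# -march=native; setup.py reads CPU_INSTRUCT and appends the matching
# -DLLAMA_* flags to the CMake invocation.
CPU_INSTRUCT=AVX512 pip install . --no-build-isolation

# Roughly equivalent manual configure of just the C++ extension (sketch):
cmake -S ktransformers/ktransformers_ext -B build \
      -DLLAMA_NATIVE=OFF -DLLAMA_FMA=ON -DLLAMA_F16C=ON \
      -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_AVX512=ON
```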