-
Notifications
You must be signed in to change notification settings - Fork 15
/
setup.py
43 lines (40 loc) · 1.24 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from setuptools import find_packages, setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
# Compiler flags passed to the extension build: "cxx" goes to the host C++
# compiler, "nvcc" to NVIDIA's device compiler.
_cxx_flags = [
    "-O3",
    "-std=c++17",
]
_nvcc_flags = [
    "-O3",
    "--use_fast_math",
    "-std=c++17",
    # Cap register usage per thread so the kernels reach target occupancy.
    "-maxrregcount=255",
    # Verbose ptxas output plus warnings on local-memory usage and spills.
    "--ptxas-options=-v,-warn-lmem-usage,--warn-on-spills",
    # Build for Ampere (compute capability 8.0, e.g. A100).
    "-gencode=arch=compute_80,code=sm_80",
]
extra_compile_args = {"cxx": _cxx_flags, "nvcc": _nvcc_flags}
# Package metadata and build configuration for the fp6_llm CUDA extension.
# The CUDA sources are compiled by torch's BuildExtension, which injects the
# correct torch/CUDA include paths and ABI flags.
setup(
    name="fp6_llm",
    author="Haojun Xia, Zhen Zheng, Xiaoxia Wu, Shiyang Chen, Zhewei Yao, Stephen Youn, Arash Bakhtiari, Michael Wyatt, Donglin Zhuang, Zhongzhu Zhou, Olatunji Ruwase, Yuxiong He, Shuaiwen Leon Song",
    version="0.2",
    author_email="[email protected]",
    # NOTE(review): was `description =` — PEP 8 forbids spaces around `=` in
    # keyword arguments; now consistent with the other kwargs in this call.
    description="An efficient GPU support for LLM inference with x-bit quantization (e.g., FP6 and FP5).",
    python_requires=">=3.8",
    install_requires=[
        "torch",
        "transformers"
    ],
    packages=find_packages(),
    ext_modules=[
        # Single mixed C++/CUDA extension: pybind.cpp exposes the Python
        # bindings, fp6_linear.cu holds the device kernels.
        CUDAExtension(
            name="fp6_llm_cuda",
            sources=[
                "fp6_llm/csrc/pybind.cpp",
                "fp6_llm/csrc/fp6_linear.cu"
            ],
            extra_compile_args=extra_compile_args,
        ),
    ],
    # BuildExtension handles nvcc invocation and per-compiler flag routing.
    cmdclass={"build_ext": BuildExtension}
)