moved cuda kernel
Andrei Panferov committed Feb 6, 2024
1 parent 78cc9a8 commit 1278164
Showing 6 changed files with 6 additions and 6 deletions.
1 change: 0 additions & 1 deletion inference_lib/src/aqlm/__init__.py
@@ -1,3 +1,2 @@
-import aqlm.cuda
 import aqlm.inference_kernels
 from aqlm.inference import QuantizedLinear
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
11 changes: 6 additions & 5 deletions inference_lib/src/aqlm/inference_kernels/kernel_selector.py
@@ -5,9 +5,6 @@
 import torch.nn.functional as F
 from aqlm.utils import _dequantize_weight, unpack_int_data
 
-from .numba import numba_gemm_lut
-from .triton_kernel import triton_matmul
-
 
 def forward_pass_quantized_linear(
     input: torch.Tensor,
@@ -19,16 +16,20 @@ def forward_pass_quantized_linear(
     num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape
     match (input.is_cuda, num_codebooks, codebook_size, out_group_size, in_group_size):
         case (True, 1, 65536, 1, 8):
-            from aqlm.cuda.cuda_kernel import cuda_gemm_1x16
+            from .cuda_kernel import cuda_gemm_1x16
 
             return cuda_gemm_1x16(input, codes, codebooks, scales, bias)
         case (True, 2, 256, 1, 8):
-            from aqlm.cuda.cuda_kernel import cuda_gemm_2x8
+            from .cuda_kernel import cuda_gemm_2x8
 
             return cuda_gemm_2x8(input, codes, codebooks, scales, bias)
         case (True, _, _, _, _):
+            from .triton_kernel import triton_matmul
+
             return triton_matmul(input, codes, codebooks, scales, bias)
         case (False, _, 256, 1, _):
+            from .numba import numba_gemm_lut
+
             return numba_gemm_lut(input, codes, codebooks, scales, bias)
         case _:
             dequantized_weight = _dequantize_weight(
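For reference, below is a minimal, self-contained sketch of the dispatch logic kernel_selector.py implements after this change. The helper name select_kernel and its string return values are hypothetical and used only for illustration (they are not part of the AQLM API); the shape tuples mirror the cases in the diff above, where the real code lazily imports and calls the corresponding kernel inside each branch.

def select_kernel(is_cuda: bool, num_codebooks: int, codebook_size: int,
                  out_group_size: int, in_group_size: int) -> str:
    """Name the kernel the selector would route to for a given configuration."""
    match (is_cuda, num_codebooks, codebook_size, out_group_size, in_group_size):
        case (True, 1, 65536, 1, 8):
            return "cuda_gemm_1x16"   # specialized CUDA kernel: 1 codebook of 2^16 codes
        case (True, 2, 256, 1, 8):
            return "cuda_gemm_2x8"    # specialized CUDA kernel: 2 codebooks of 2^8 codes
        case (True, _, _, _, _):
            return "triton_matmul"    # generic GPU path via Triton
        case (False, _, 256, 1, _):
            return "numba_gemm_lut"   # CPU path via the Numba LUT GEMM
        case _:
            return "dequantize"       # fall back to dequantizing the weight


if __name__ == "__main__":
    print(select_kernel(True, 1, 65536, 1, 8))   # cuda_gemm_1x16
    print(select_kernel(False, 4, 256, 1, 16))   # numba_gemm_lut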
