From 88ab9074423fe17510e4879d23baab5a1d7c9eb6 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Wed, 26 Apr 2023 18:33:21 +0800
Subject: [PATCH 01/15] integrate auto-gptq

---
 models/custom_autotune.py | 167 ----------------
 models/modeling_moss.py   |  17 +-
 models/quantization.py    | 393 --------------------------------------
 requirements.txt          |   1 +
 4 files changed, 15 insertions(+), 563 deletions(-)
 delete mode 100644 models/custom_autotune.py
 delete mode 100644 models/quantization.py

diff --git a/models/custom_autotune.py b/models/custom_autotune.py
deleted file mode 100644
index ed8ee24..0000000
--- a/models/custom_autotune.py
+++ /dev/null
@@ -1,167 +0,0 @@
-#https://github.com/fpgaminer/GPTQ-triton
-"""
-Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100.
-"""
-
-import builtins
-import math
-import time
-from typing import Dict
-
-import triton
-
-
-class Autotuner(triton.KernelInterface):
-	def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None, nearest_power_of_two: bool = False):
-		'''
-		:param prune_configs_by: a dict of functions that are used to prune configs, fields:
-			'perf_model': performance model used to predicate running time with different configs, returns running time
-			'top_k': number of configs to bench
-			'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs.
-			'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results
-		'''
-		if not configs:
-			self.configs = [triton.Config({}, num_warps=4, num_stages=2)]
-		else:
-			self.configs = configs
-		self.key_idx = [arg_names.index(k) for k in key]
-		self.nearest_power_of_two = nearest_power_of_two
-		self.cache = {}
-		# hook to reset all required tensor to zeros before relaunching a kernel
-		self.hook = lambda args: 0
-		if reset_to_zero is not None:
-			self.reset_idx = [arg_names.index(k) for k in reset_to_zero]
-
-			def _hook(args):
-				for i in self.reset_idx:
-					args[i].zero_()
-			self.hook = _hook
-		self.arg_names = arg_names
-		# prune configs
-		if prune_configs_by:
-			perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k']
-			if 'early_config_prune' in prune_configs_by:
-				early_config_prune = prune_configs_by['early_config_prune']
-		else:
-			perf_model, top_k, early_config_prune = None, None, None
-		self.perf_model, self.configs_top_k = perf_model, top_k
-		self.early_config_prune = early_config_prune
-		self.fn = fn
-
-	def _bench(self, *args, config, **meta):
-		# check for conflicts, i.e. meta-parameters both provided
-		# as kwargs and by the autotuner
-		conflicts = meta.keys() & config.kwargs.keys()
-		if conflicts:
-			raise ValueError(
-				f"Conflicting meta-parameters: {', '.join(conflicts)}."
-				" Make sure that you don't re-define auto-tuned symbols."
-			)
-		# augment meta-parameters with tunable ones
-		current = dict(meta, **config.kwargs)
-
-		def kernel_call():
-			if config.pre_hook:
-				config.pre_hook(self.nargs)
-			self.hook(args)
-			self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current)
-		try:
-			# In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
-			# PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
-			return triton.testing.do_bench(kernel_call, rep=40)
-		except triton.compiler.OutOfResources:
-			return float('inf')
-
-	def run(self, *args, **kwargs):
-		self.nargs = dict(zip(self.arg_names, args))
-		if len(self.configs) > 1:
-			key = tuple(args[i] for i in self.key_idx)
-
-			# This reduces the amount of autotuning by rounding the keys to the nearest power of two
-			# In my testing this gives decent results, and greatly reduces the amount of tuning required
-			if self.nearest_power_of_two:
-				key = tuple([2 ** int(math.log2(x) + 0.5) for x in key])
-			
-			if key not in self.cache:
-				# prune configs
-				pruned_configs = self.prune_configs(kwargs)
-				bench_start = time.time()
-				timings = {config: self._bench(*args, config=config, **kwargs)
-							for config in pruned_configs}
-				bench_end = time.time()
-				self.bench_time = bench_end - bench_start
-				self.cache[key] = builtins.min(timings, key=timings.get)
-				self.hook(args)
-				self.configs_timings = timings
-			config = self.cache[key]
-		else:
-			config = self.configs[0]
-		self.best_config = config
-		if config.pre_hook is not None:
-			config.pre_hook(self.nargs)
-		return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
-
-	def prune_configs(self, kwargs):
-		pruned_configs = self.configs
-		if self.early_config_prune:
-			pruned_configs = self.early_config_prune(self.configs, self.nargs)
-		if self.perf_model:
-			top_k = self.configs_top_k
-			if isinstance(top_k, float) and top_k <= 1.0:
-				top_k = int(len(self.configs) * top_k)
-			if len(pruned_configs) > top_k:
-				est_timing = {
-					config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages,
-											num_warps=config.num_warps)
-					for config in pruned_configs
-				}
-				pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k]
-		return pruned_configs
-
-	def warmup(self, *args, **kwargs):
-		self.nargs = dict(zip(self.arg_names, args))
-		for config in self.prune_configs(kwargs):
-			self.fn.warmup(
-				*args,
-				num_warps=config.num_warps,
-				num_stages=config.num_stages,
-				**kwargs,
-				**config.kwargs,
-			)
-		self.nargs = None
-
-
-def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False):
-	"""
-	Decorator for auto-tuning a :code:`triton.jit`'d function.
-	.. highlight:: python
-	.. code-block:: python
-		@triton.autotune(configs=[
-			triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4),
-			triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8),
-			],
-			key=['x_size'] # the two above configs will be evaluated anytime
-							# the value of x_size changes
-		)
-		@triton.jit
-		def kernel(x_ptr, x_size, **META):
-			BLOCK_SIZE = META['BLOCK_SIZE']
-	:note: When all the configurations are evaluated, the kernel will run multiple time.
-			This means that whatever value the kernel updates will be updated multiple times.
-			To avoid this undesired behavior, you can use the `reset_to_zero` argument, which
-			reset the value of the provided tensor to `zero` before running any configuration.
-	:param configs: a list of :code:`triton.Config` objects
-	:type configs: list[triton.Config]
-	:param key: a list of argument names whose change in value will trigger the evaluation of all provided configs.
-	:type key: list[str]
-	:param prune_configs_by: a dict of functions that are used to prune configs, fields:
-		'perf_model': performance model used to predicate running time with different configs, returns running time
-		'top_k': number of configs to bench
-		'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs.
-	:param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs.
-	:type reset_to_zero: list[str]
-	"""
-	def decorator(fn):
-		return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, prune_configs_by, nearest_power_of_two)
-
-	return decorator
diff --git a/models/modeling_moss.py b/models/modeling_moss.py
index dddfe2f..f8e0677 100644
--- a/models/modeling_moss.py
+++ b/models/modeling_moss.py
@@ -733,6 +733,17 @@ def _reorder_cache(
         )
 
     def quantize(self, wbits, groupsize):
-        from .quantization import quantize_with_gptq
-        return quantize_with_gptq(self, wbits, groupsize)
-
+        from auto_gptq.modeling._utils import make_quant, find_layers
+        try:
+            import triton  # imported only to detect whether triton is installed
+            use_triton = True
+        except ImportError:
+            use_triton = False
+        # Gather layers via find_layers, then drop lm_head so it is not handed to make_quant.
+        layers = find_layers(self)
+        for name in ["lm_head"]:
+            if name in layers:
+                del layers[name]
+        make_quant(self, layers, wbits, groupsize, use_triton=use_triton)
+        # NOTE(review): presumably make_quant rewires self in place (the removed local helper did) -- confirm.
+        return self
diff --git a/models/quantization.py b/models/quantization.py
deleted file mode 100644
index 1633a0c..0000000
--- a/models/quantization.py
+++ /dev/null
@@ -1,393 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.cuda.amp import custom_bwd, custom_fwd
-import math
-import triton
-import triton.language as tl
-from models.custom_autotune import *
-
-
-def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
-    if type(module) in layers:
-        return {name: module}
-    res = {}
-    for name1, child in module.named_children():
-        res.update(find_layers(
-            child, layers=layers, name=name + '.' + name1 if name != '' else name1
-        ))
-    return res
-
-
-# code based https://github.com/fpgaminer/GPTQ-triton
-@autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        # These provided a benefit on a 3090
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-    ],
-    key=['M', 'N'],
-    nearest_power_of_two=True,
-)
-@triton.jit
-def matmul_248_kernel(a_ptr, b_ptr, c_ptr,
-                      scales_ptr, zeros_ptr, g_ptr,
-                      M, N, K, bits, maxq,
-                      stride_am, stride_ak,
-                      stride_bk, stride_bn,
-                      stride_cm, stride_cn,
-                      stride_scales, stride_zeros,
-                      BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-                      GROUP_SIZE_M: tl.constexpr):
-    """
-    Compute the matrix multiplication C = A x B.
-    A is of shape (M, K) float16
-    B is of shape (K//8, N) int32
-    C is of shape (M, N) float16
-    scales is of shape (G, N) float16
-    zeros is of shape (G, N) float16
-    g_ptr is of shape (K) int32
-    """
-    infearure_per_bits = 32 // bits
-
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-    a_mask = (offs_am[:, None] < M)
-    # b_ptrs is set up such that it repeats elements along the K axis 8 times
-    b_ptrs = b_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None,
-                                                                            :] * stride_bn)  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-    g_ptrs = g_ptr + offs_k
-    # shifter is used to extract the N bits of each element in the 32-bit word from B
-    scales_ptrs = scales_ptr + offs_bn[None, :]
-    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)
-
-    shifter = (offs_k % infearure_per_bits) * bits
-    zeros_shifter = (offs_bn % infearure_per_bits) * bits
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-
-    for k in range(0, num_pid_k):
-        g_idx = tl.load(g_ptrs)
-
-        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-        scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-        zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-
-        zeros = (zeros >> zeros_shifter[None, :]) & maxq
-        zeros = (zeros + 1)
-
-        a = tl.load(a_ptrs, mask=a_mask, other=0.)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
-        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-
-        # Now we need to unpack b (which is N-bit values) into 32-bit values
-        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
-        b = (b - zeros) * scales  # Scale and shift
-
-        accumulator += tl.dot(a, b)
-        a_ptrs += BLOCK_SIZE_K
-        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
-        g_ptrs += BLOCK_SIZE_K
-
-    c = accumulator.to(tl.float16)
-    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
-    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
-    tl.store(c_ptrs, accumulator, mask=c_mask)
-
-
-# code based https://github.com/fpgaminer/GPTQ-triton
-@autotune(
-    configs=[
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 256, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-        # These provided a benefit on a 3090
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4,
-                      num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 128, 'GROUP_SIZE_M': 8},
-                      num_stages=4, num_warps=4),
-    ],
-    key=['M', 'K'],
-    nearest_power_of_two=True,
-)
-@triton.jit
-def trans_matmul_248_kernel(a_ptr, b_ptr, c_ptr,
-                            scales_ptr, zeros_ptr, g_ptr,
-                            M, N, K, bits, maxq,
-                            stride_am, stride_ak,
-                            stride_bk, stride_bn,
-                            stride_cm, stride_cn,
-                            stride_scales, stride_zeros,
-                            BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-                            GROUP_SIZE_M: tl.constexpr):
-    """
-    Compute the matrix multiplication C = A x B.
-    A is of shape (M, N) float16
-    B is of shape (K//8, N) int32
-    C is of shape (M, K) float16
-    scales is of shape (G, N) float16
-    zeros is of shape (G, N) float16
-    g_ptr is of shape (K) int32
-    """
-    infearure_per_bits = 32 // bits
-
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_k = tl.cdiv(K, BLOCK_SIZE_K)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_k
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_k = (pid % num_pid_in_group) // group_size_m
-
-    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
-    offs_n = tl.arange(0, BLOCK_SIZE_N)
-    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak)  # (BLOCK_SIZE_M, BLOCK_SIZE_N)
-    a_mask = (offs_am[:, None] < M)
-    # b_ptrs is set up such that it repeats elements along the K axis 8 times
-    b_ptrs = b_ptr + ((offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None,
-                                                                             :] * stride_bn)  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
-    g_ptrs = g_ptr + offs_bk
-    g_idx = tl.load(g_ptrs)
-
-    # shifter is used to extract the N bits of each element in the 32-bit word from B
-    scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales
-    zeros_ptrs = zeros_ptr + (offs_n[None, :] // infearure_per_bits) + g_idx[:, None] * stride_zeros
-
-    shifter = (offs_bk % infearure_per_bits) * bits
-    zeros_shifter = (offs_n % infearure_per_bits) * bits
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32)
-
-    for k in range(0, num_pid_n):
-        # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop
-        scales = tl.load(scales_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-        zeros = tl.load(zeros_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
-
-        zeros = (zeros >> zeros_shifter[None, :]) & maxq
-        zeros = (zeros + 1)
-
-        a = tl.load(a_ptrs, mask=a_mask, other=0.)  # (BLOCK_SIZE_M, BLOCK_SIZE_N)
-        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
-
-        # Now we need to unpack b (which is N-bit values) into 32-bit values
-        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
-        b = (b - zeros) * scales  # Scale and shift
-        b = tl.trans(b)
-
-        accumulator += tl.dot(a, b)
-        a_ptrs += BLOCK_SIZE_N
-        b_ptrs += BLOCK_SIZE_N
-        scales_ptrs += BLOCK_SIZE_N
-        zeros_ptrs += (BLOCK_SIZE_N // infearure_per_bits)
-
-    c = accumulator.to(tl.float16)
-    c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bk[None, :]
-    c_mask = (offs_am[:, None] < M) & (offs_bk[None, :] < K)
-    tl.store(c_ptrs, accumulator, mask=c_mask)
-
-
-def matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
-    output = torch.empty((input.shape[0], qweight.shape[1]), device='cuda', dtype=torch.float16)
-    grid = lambda META: (
-    triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']),)
-    matmul_248_kernel[grid](input, qweight, output,
-                            scales, qzeros, g_idx,
-                            input.shape[0], qweight.shape[1], input.shape[1], bits, maxq,
-                            input.stride(0), input.stride(1),
-                            qweight.stride(0), qweight.stride(1),
-                            output.stride(0), output.stride(1),
-                            scales.stride(0), qzeros.stride(0))
-    return output
-
-
-def transpose_matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
-    output_dim = (qweight.shape[0] * 32) // bits
-    output = torch.empty((input.shape[0], output_dim), device='cuda', dtype=torch.float16)
-    grid = lambda META: (
-    triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(output_dim, META['BLOCK_SIZE_K']),)
-    transpose_matmul_248_kernel[grid](input, qweight, output,
-                                      scales, qzeros, g_idx,
-                                      input.shape[0], qweight.shape[1], output_dim, bits, maxq,
-                                      input.stride(0), input.stride(1),
-                                      qweight.stride(0), qweight.stride(1),
-                                      output.stride(0), output.stride(1),
-                                      scales.stride(0), qzeros.stride(0))
-    return output
-
-
-class QuantLinearFunction(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd(cast_inputs=torch.float16)
-    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
-        output = matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq)
-        ctx.save_for_backward(qweight, scales, qzeros, g_idx)
-        ctx.bits, ctx.maxq = bits, maxq
-        return output
-
-    @staticmethod
-    @custom_bwd
-    def backward(ctx, grad_output):
-        qweight, scales, qzeros, g_idx = ctx.saved_tensors
-        bits, maxq = ctx.bits, ctx.maxq
-        grad_input = None
-
-        if ctx.needs_input_grad[0]:
-            grad_input = transpose_matmul248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
-        return grad_input, None, None, None, None, None, None
-
-class QuantLinear(nn.Module):
-    def __init__(self, bits, groupsize, infeatures, outfeatures, bias):
-        super().__init__()
-        if bits not in [2, 4, 8]:
-            raise NotImplementedError("Only 2,4,8 bits are supported.")
-        self.infeatures = infeatures
-        self.outfeatures = outfeatures
-        self.bits = bits
-        self.maxq = 2 ** self.bits - 1
-        self.groupsize = groupsize if groupsize != -1 else infeatures
-
-        self.register_buffer('qweight', torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32))
-        self.register_buffer('qzeros', torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures // 32 * self.bits), dtype=torch.int32))
-        self.register_buffer('scales', torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures), dtype=torch.float16))
-        self.register_buffer('g_idx', torch.tensor([i // self.groupsize for i in range(infeatures)], dtype=torch.int32))
-        if bias:
-            self.register_buffer('bias', torch.zeros((outfeatures), dtype=torch.float16))
-        else:
-            self.bias = None
-
-    def pack(self, linear, scales, zeros, g_idx=None):
-        self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx
-
-        scales = scales.t().contiguous()
-        zeros = zeros.t().contiguous()
-        scale_zeros = zeros * scales
-        self.scales = scales.clone().half()
-        if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-
-        intweight = []
-        for idx in range(self.infeatures):
-            intweight.append(torch.round(
-                (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(
-                torch.int)[:, None])
-        intweight = torch.cat(intweight, dim=1)
-        intweight = intweight.t().contiguous()
-        intweight = intweight.numpy().astype(np.uint32)
-        qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32)
-        i = 0
-        row = 0
-        while row < qweight.shape[0]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qweight[row] |= intweight[j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                row += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qweight = qweight.astype(np.int32)
-        self.qweight = torch.from_numpy(qweight)
-
-        zeros -= 1
-        zeros = zeros.numpy().astype(np.uint32)
-        qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32)
-        i = 0
-        col = 0
-        while col < qzeros.shape[1]:
-            if self.bits in [2, 4, 8]:
-                for j in range(i, i + (32 // self.bits)):
-                    qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i))
-                i += 32 // self.bits
-                col += 1
-            else:
-                raise NotImplementedError("Only 2,4,8 bits are supported.")
-
-        qzeros = qzeros.astype(np.int32)
-        self.qzeros = torch.from_numpy(qzeros)
-
-    def forward(self, x):
-        out_shape = x.shape[:-1] + (self.outfeatures,)
-        out = QuantLinearFunction.apply(x.reshape(-1, x.shape[-1]), self.qweight, self.scales,
-                                        self.qzeros, self.g_idx, self.bits, self.maxq)
-        out = out + self.bias if self.bias is not None else out
-        return out.reshape(out_shape)
-
-def make_quant(module, names, bits, groupsize, name=''):
-    if isinstance(module, QuantLinear):
-        return
-    for attr in dir(module):
-        tmp = getattr(module, attr)
-        name1 = name + '.' + attr if name != '' else attr
-        if name1 in names:
-            delattr(module, attr)
-            setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None))
-    for name1, child in module.named_children():
-        make_quant(child, names, bits, groupsize, name + '.' + name1 if name != '' else name1)
-
-
-def quantize_with_gptq(model, wbits, groupsize):
-    model = model.eval()
-    layers = find_layers(model)
-    for name in ['lm_head']:
-        if name in layers:
-            del layers[name]
-    make_quant(model, layers, wbits, groupsize)
-    # model.load_state_dict(torch.load(checkpoint))
-    return model
diff --git a/requirements.txt b/requirements.txt
index d92c6d9..c270e3e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ accelerate
 matplotlib
 huggingface_hub
 gradio
+"auto-gptq -i https://pypi.org/simple"

From b24970741985ad24b7a82060edf1e8099d91fe33 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Wed, 26 Apr 2023 19:45:33 +0800
Subject: [PATCH 02/15] update requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 274b875..6ca9176 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,5 +6,5 @@ accelerate
 matplotlib
 huggingface_hub
 gradio
-"auto-gptq -i https://pypi.org/simple"
+auto-gptq
 mdtex2html

From a6c8b77c5853762f475a2408fc8ad3b160803102 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Wed, 26 Apr 2023 19:49:25 +0800
Subject: [PATCH 03/15] add simple script to automatically create conda environment
 and install dependencies

---
 setup_env.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 setup_env.py

diff --git a/setup_env.py b/setup_env.py
new file mode 100644
index 0000000..09cac8a
--- /dev/null
+++ b/setup_env.py
@@ -0,0 +1,51 @@
+import subprocess
+from argparse import ArgumentParser
+
+"""WARNING: this scripts may only works on linux"""
+
+
+pip_dependencies = [
+    # change version based on your situation
+    "torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116",
+    "transformers==4.25.1",
+    "sentencepiece",
+    "datasets",
+    "accelerate",
+    "matplotlib",
+    "huggingface_hub",
+    "gradio",
+    "auto-gptq -i https://pypi.org/simple",
+    "mdtex2html"
+]
+
+
+def setup_env():
+    parser = ArgumentParser()
+    parser.add_argument("--conda_home", type=str, default="/root/miniconda3/bin")
+    parser.add_argument("--init_conda", action="store_true")
+    parser.add_argument("--conda_name", type=str, default="moss")
+    parser.add_argument("--python_version", type=str, default="3.10")
+    args = parser.parse_args()
+
+    if args.init_conda:
+        print(
+            subprocess.run(
+                f"./conda create -n {args.conda_name} python={args.python_version} -y".split(),
+                check=True,
+                stdout=subprocess.PIPE,
+                cwd=args.conda_home
+            ).stdout.decode()
+        )
+    for pip_dependency in pip_dependencies:
+        print(
+            subprocess.run(
+                f"./conda run -n {args.conda_name} pip install -U {pip_dependency}".split(),
+                check=True,
+                stdout=subprocess.PIPE,
+                cwd=args.conda_home
+            ).stdout.decode()
+        )
+
+
+if __name__ == "__main__":
+    setup_env()

From d2b413c2f6107cf4409a566e9aa91d87919ff093 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 10:55:14 +0800
Subject: [PATCH 04/15] make torch installation optional

---
 setup_env.py | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/setup_env.py b/setup_env.py
index 09cac8a..b12ba55 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -3,10 +3,10 @@
 
 """WARNING: this scripts may only works on linux"""
 
+# Change versions to suit your setup. Keep this a plain str (no trailing comma): it is formatted into the pip command.
+pip_torch = "torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116"
 
 pip_dependencies = [
-    # change version based on your situation
-    "torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116",
     "transformers==4.25.1",
     "sentencepiece",
     "datasets",
@@ -25,6 +25,7 @@ def setup_env():
     parser.add_argument("--init_conda", action="store_true")
     parser.add_argument("--conda_name", type=str, default="moss")
     parser.add_argument("--python_version", type=str, default="3.10")
+    parser.add_argument("--reinstall_torch", action="store_true")
     args = parser.parse_args()
 
     if args.init_conda:
@@ -36,10 +37,42 @@ def setup_env():
                 cwd=args.conda_home
             ).stdout.decode()
         )
+
+    try:
+        import torch
+    except ImportError:
+        print(
+            subprocess.run(
+                f"./conda run -n {args.conda_name} pip install {pip_torch}".split(),
+                check=True,
+                stdout=subprocess.PIPE,
+                cwd=args.conda_home
+            ).stdout.decode()
+        )
+        args.reinstall_torch = False
+
+    if args.reinstall_torch:
+        print(
+            subprocess.run(
+                f"./conda run -n {args.conda_name} pip uninstall torch -y".split(),
+                check=True,
+                stdout=subprocess.PIPE,
+                cwd=args.conda_home
+            ).stdout.decode()
+        )
+        print(
+            subprocess.run(
+                f"./conda run -n {args.conda_name} pip install {pip_torch}".split(),
+                check=True,
+                stdout=subprocess.PIPE,
+                cwd=args.conda_home
+            ).stdout.decode()
+        )
+
     for pip_dependency in pip_dependencies:
         print(
             subprocess.run(
-                f"./conda run -n {args.conda_name} pip install -U {pip_dependency}".split(),
+                f"./conda run -n {args.conda_name} pip install {pip_dependency}".split(),
                 check=True,
                 stdout=subprocess.PIPE,
                 cwd=args.conda_home

From 33e5b5a718e42db202de85d22efa34d4a02420e1 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 10:58:58 +0800
Subject: [PATCH 05/15] handle more exceptions

---
 setup_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup_env.py b/setup_env.py
index b12ba55..6731663 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -40,7 +40,7 @@ def setup_env():
 
     try:
         import torch
-    except ImportError:
+    except:
         print(
             subprocess.run(
                 f"./conda run -n {args.conda_name} pip install {pip_torch}".split(),

From a2a0f4a13f1e3eb6e30a5d7a48eea89a3538ba5b Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:05:38 +0800
Subject: [PATCH 06/15] make installation of cuda extension and triton optional

---
 setup_env.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/setup_env.py b/setup_env.py
index 6731663..d1c318c 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -26,6 +26,8 @@ def setup_env():
     parser.add_argument("--conda_name", type=str, default="moss")
     parser.add_argument("--python_version", type=str, default="3.10")
     parser.add_argument("--reinstall_torch", action="store_true")
+    parser.add_argument("--no_cuda_ext_for_auto_gptq", action="store_true")
+    parser.add_argument("--use_triton", action="store_true")
     args = parser.parse_args()
 
     if args.init_conda:
@@ -70,9 +72,14 @@ def setup_env():
         )
 
     for pip_dependency in pip_dependencies:
+        command = f"./conda run -n {args.conda_name} pip install {pip_dependency}"
+        if "auto-gptq" in pip_dependency and args.no_cuda_ext_for_auto_gptq:
+            command = "BUILD_CUDA_EXT=0 " + command
+        if "auto-gptq" in pip_dependency and args.use_triton:
+            command = command.replace("auto-gptq", "auto-gptq[triton]")
         print(
             subprocess.run(
-                f"./conda run -n {args.conda_name} pip install {pip_dependency}".split(),
+                command.split(),
                 check=True,
                 stdout=subprocess.PIPE,
                 cwd=args.conda_home

From b19f5d62ac1ca5bcaea164117506d95e7c213614 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:07:51 +0800
Subject: [PATCH 07/15] bug fix: remove automatic torch install on failed import

---
 setup_env.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/setup_env.py b/setup_env.py
index d1c318c..d5712c8 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -40,19 +40,6 @@ def setup_env():
             ).stdout.decode()
         )
 
-    try:
-        import torch
-    except:
-        print(
-            subprocess.run(
-                f"./conda run -n {args.conda_name} pip install {pip_torch}".split(),
-                check=True,
-                stdout=subprocess.PIPE,
-                cwd=args.conda_home
-            ).stdout.decode()
-        )
-        args.reinstall_torch = False
-
     if args.reinstall_torch:
         print(
             subprocess.run(

From 6c550329aeed6f6692a10b8e199280bf72264d40 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:11:26 +0800
Subject: [PATCH 08/15] bug fix: drop trailing comma that made pip_torch a tuple

---
 setup_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup_env.py b/setup_env.py
index d5712c8..046d8ff 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -4,7 +4,7 @@
 """WARNING: this scripts may only works on linux"""
 
 # change version based on your situation
-pip_torch = "torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116",
+pip_torch = "torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116"
 
 pip_dependencies = [
     "transformers==4.25.1",

From 8e9fdcdb0f323b3c9595a68b7e50726b44806d14 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:13:34 +0800
Subject: [PATCH 09/15] rename --use_triton flag to --install_triton for clarity

---
 setup_env.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup_env.py b/setup_env.py
index 046d8ff..3bd1376 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -27,7 +27,7 @@ def setup_env():
     parser.add_argument("--python_version", type=str, default="3.10")
     parser.add_argument("--reinstall_torch", action="store_true")
     parser.add_argument("--no_cuda_ext_for_auto_gptq", action="store_true")
-    parser.add_argument("--use_triton", action="store_true")
+    parser.add_argument("--install_triton", action="store_true")
     args = parser.parse_args()
 
     if args.init_conda:
@@ -62,7 +62,7 @@ def setup_env():
         command = f"./conda run -n {args.conda_name} pip install {pip_dependency}"
         if "auto-gptq" in pip_dependency and args.no_cuda_ext_for_auto_gptq:
             command = "BUILD_CUDA_EXT=0 " + command
-        if "auto-gptq" in pip_dependency and args.use_triton:
+        if "auto-gptq" in pip_dependency and args.install_triton:
             command = command.replace("auto-gptq", "auto-gptq[triton]")
         print(
             subprocess.run(

From f7406a5b74371ff1407fae3220c1e1208d3a3c59 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:14:13 +0800
Subject: [PATCH 10/15] bug fix: pass BUILD_CUDA_EXT=0 inside the conda run command

---
 setup_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup_env.py b/setup_env.py
index 3bd1376..969cee2 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -61,7 +61,7 @@ def setup_env():
     for pip_dependency in pip_dependencies:
         command = f"./conda run -n {args.conda_name} pip install {pip_dependency}"
         if "auto-gptq" in pip_dependency and args.no_cuda_ext_for_auto_gptq:
-            command = "BUILD_CUDA_EXT=0 " + command
+            command = f"./conda run BUILD_CUDA_EXT=0 -n {args.conda_name} pip install {pip_dependency}"
         if "auto-gptq" in pip_dependency and args.install_triton:
             command = command.replace("auto-gptq", "auto-gptq[triton]")
         print(

From 4e672b988e58a808fc6741642e888faafa27e329 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:16:14 +0800
Subject: [PATCH 11/15] bug fix: place BUILD_CUDA_EXT=0 after the conda env name

---
 setup_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup_env.py b/setup_env.py
index 969cee2..7240896 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -61,7 +61,7 @@ def setup_env():
     for pip_dependency in pip_dependencies:
         command = f"./conda run -n {args.conda_name} pip install {pip_dependency}"
         if "auto-gptq" in pip_dependency and args.no_cuda_ext_for_auto_gptq:
-            command = f"./conda run BUILD_CUDA_EXT=0 -n {args.conda_name} pip install {pip_dependency}"
+            command = f"./conda run -n {args.conda_name} BUILD_CUDA_EXT=0 pip install {pip_dependency}"
         if "auto-gptq" in pip_dependency and args.install_triton:
             command = command.replace("auto-gptq", "auto-gptq[triton]")
         print(

From d318bed6017fe556839851c07201ddca31cf9d5f Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:20:17 +0800
Subject: [PATCH 12/15] add help messages

---
 setup_env.py | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/setup_env.py b/setup_env.py
index 7240896..4806d9e 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -21,13 +21,31 @@
 
 def setup_env():
     parser = ArgumentParser()
-    parser.add_argument("--conda_home", type=str, default="/root/miniconda3/bin")
-    parser.add_argument("--init_conda", action="store_true")
+    parser.add_argument(
+        "--conda_home",
+        type=str,
+        default="/root/miniconda3/bin",
+        help="path to where your conda executable installed"
+    )
     parser.add_argument("--conda_name", type=str, default="moss")
-    parser.add_argument("--python_version", type=str, default="3.10")
-    parser.add_argument("--reinstall_torch", action="store_true")
-    parser.add_argument("--no_cuda_ext_for_auto_gptq", action="store_true")
-    parser.add_argument("--install_triton", action="store_true")
+    parser.add_argument(
+        "--init_conda",
+        action="store_true",
+        help="whether to create a new conda environment whose name is 'conda_name', make sure it's not exists."
+    )
+    parser.add_argument(
+        "--python_version",
+        type=str,
+        default="3.10",
+        help="python version used when creating conda env"
+    )
+    parser.add_argument("--reinstall_torch", action="store_true", help="whether to reinstall pytorch or not.")
+    parser.add_argument(
+        "--no_cuda_ext_for_auto_gptq",
+        action="store_true",
+        help="whether to not install CUDA extension for auto-gptq"
+    )
+    parser.add_argument("--install_triton", action="store_true", help="whether to install triton")
     args = parser.parse_args()
 
     if args.init_conda:

From 04562ca452ab250e4a6c0bda5d4eae4223da6b07 Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Thu, 27 Apr 2023 11:29:23 +0800
Subject: [PATCH 13/15] make installation of auto_gptq optional

---
 setup_env.py | 77 ++++++++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 35 deletions(-)

diff --git a/setup_env.py b/setup_env.py
index 4806d9e..2fa4860 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -6,6 +6,8 @@
 # change version based on your situation
 pip_torch = "torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116"
 
+pip_auto_gptq = "auto-gptq -i https://pypi.org/simple"
+
 pip_dependencies = [
     "transformers==4.25.1",
     "sentencepiece",
@@ -14,11 +16,21 @@
     "matplotlib",
     "huggingface_hub",
     "gradio",
-    "auto-gptq -i https://pypi.org/simple",
     "mdtex2html"
 ]
 
 
+def run_command_and_show(cmd: str, conda_home):
+    print(
+        subprocess.run(
+            cmd.split(),
+            check=True,
+            stdout=subprocess.PIPE,
+            cwd=conda_home
+        ).stdout.decode()
+    )
+
+
 def setup_env():
     parser = ArgumentParser()
     parser.add_argument(
@@ -40,55 +52,50 @@ def setup_env():
         help="python version used when creating conda env"
     )
     parser.add_argument("--reinstall_torch", action="store_true", help="whether to reinstall pytorch or not.")
+    parser.add_argument("--install_auto_gptq", action="store_true", help="whether to install auto-gptq")
     parser.add_argument(
         "--no_cuda_ext_for_auto_gptq",
         action="store_true",
-        help="whether to not install CUDA extension for auto-gptq"
+        help="whether to not install CUDA extension for auto-gptq, only effects when set flag --install_auto_gptq"
+    )
+    parser.add_argument(
+        "--install_triton",
+        action="store_true",
+        help="whether to install triton, only effects when set flag --install_auto_gptq"
     )
-    parser.add_argument("--install_triton", action="store_true", help="whether to install triton")
     args = parser.parse_args()
 
     if args.init_conda:
-        print(
-            subprocess.run(
-                f"./conda create -n {args.conda_name} python={args.python_version} -y".split(),
-                check=True,
-                stdout=subprocess.PIPE,
-                cwd=args.conda_home
-            ).stdout.decode()
+        run_command_and_show(
+            cmd=f"./conda create -n {args.conda_name} python={args.python_version} -y",
+            conda_home=args.conda_home
         )
 
     if args.reinstall_torch:
-        print(
-            subprocess.run(
-                f"./conda run -n {args.conda_name} pip uninstall torch -y".split(),
-                check=True,
-                stdout=subprocess.PIPE,
-                cwd=args.conda_home
-            ).stdout.decode()
+        run_command_and_show(
+            cmd=f"./conda run -n {args.conda_name} pip uninstall torch -y",
+            conda_home=args.conda_home
         )
-        print(
-            subprocess.run(
-                f"./conda run -n {args.conda_name} pip install {pip_torch}".split(),
-                check=True,
-                stdout=subprocess.PIPE,
-                cwd=args.conda_home
-            ).stdout.decode()
+        run_command_and_show(
+            cmd=f"./conda run -n {args.conda_name} pip install {pip_torch}",
+            conda_home=args.conda_home
         )
 
     for pip_dependency in pip_dependencies:
-        command = f"./conda run -n {args.conda_name} pip install {pip_dependency}"
-        if "auto-gptq" in pip_dependency and args.no_cuda_ext_for_auto_gptq:
-            command = f"./conda run -n {args.conda_name} BUILD_CUDA_EXT=0 pip install {pip_dependency}"
-        if "auto-gptq" in pip_dependency and args.install_triton:
+        run_command_and_show(
+            cmd=f"./conda run -n {args.conda_name} pip install {pip_dependency}",
+            conda_home=args.conda_home
+        )
+
+    if args.install_auto_gptq:
+        command = f"./conda run -n {args.conda_name} pip install {pip_auto_gptq}"
+        if args.no_cuda_ext_for_auto_gptq:
+            command = f"./conda run -n {args.conda_name} BUILD_CUDA_EXT=0 pip install {pip_auto_gptq}"
+        if args.install_triton:
             command = command.replace("auto-gptq", "auto-gptq[triton]")
-        print(
-            subprocess.run(
-                command.split(),
-                check=True,
-                stdout=subprocess.PIPE,
-                cwd=args.conda_home
-            ).stdout.decode()
+        run_command_and_show(
+            cmd=command,
+            conda_home=args.conda_home
         )
 
 

From 0230cfc0f464169888434eb15f72f966d6450b5e Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Sat, 29 Apr 2023 11:12:19 +0800
Subject: [PATCH 14/15] add script to quantize moss using auto-gptq

---
 quant_moss_model.py | 112 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 quant_moss_model.py

diff --git a/quant_moss_model.py b/quant_moss_model.py
new file mode 100644
index 0000000..804bf4b
--- /dev/null
+++ b/quant_moss_model.py
@@ -0,0 +1,112 @@
+import glob
+import json
+import random
+import time
+from argparse import ArgumentParser
+from os.path import join
+from functools import partial
+
+import torch
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+from datasets import Dataset
+from transformers import AutoTokenizer
+
+with_plug_in_data_path = "SFT_data/conversations/conversation_with_plugins"
+without_plug_in_data_path = "SFT_data/conversations/conversation_without_plugins"
+
+
+def load_data(tokenizer, with_plugin=False):
+    def _load_data(data_dir):
+        print(f"load data files from {data_dir}")
+        for file in glob.glob(join(data_dir, "**/*.json"), recursive=True):
+            with open(file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+                num_turns = data["num_turns"]
+                prompt = data["meta_instruction"]
+                for i in range(num_turns):
+                    turn = data["chat"][f"turn_{i + 1}"]
+                    for field in ["Human", "Inner Thoughts", "Commands", "Tool Responses", "MOSS"]:
+                        prompt += turn[field]
+            tokenized_data = tokenizer(prompt, truncation=True)
+            ds.append(tokenized_data)
+
+    ds = []
+    _load_data(without_plug_in_data_path)
+    if with_plugin:
+        _load_data(with_plug_in_data_path)
+
+    ds = sorted(ds, key=lambda x: len(x["input_ids"]))
+
+    print(f"use {len(ds)} examples to quantize model, {with_plugin=}")
+
+    return ds
+
+
+def main():
+    parser = ArgumentParser()
+    parser.add_argument("--pretrained_model_dir", type=str)
+    parser.add_argument("--quantized_model_dir", type=str, default=None)
+    parser.add_argument("--bits", type=int, default=4, choices=[2, 3, 4, 8])
+    parser.add_argument("--group_size", type=int, default=128)
+    parser.add_argument("--save_and_reload", action="store_true", help="whether save quantized model to disk and reload back")
+    parser.add_argument("--fast_tokenizer", action="store_true", help="whether use fast tokenizer")
+    parser.add_argument("--use_triton", action="store_true", help="whether use triton to speedup at inference")
+    parser.add_argument("--per_gpu_max_memory", type=int, default=None, help="max memory used to load model per gpu")
+    parser.add_argument("--cpu_max_memory", type=int, default=None, help="max memory used to offload model to cpu")
+    parser.add_argument("--quant_batch_size", type=int, default=1, help="examples batch size for quantization")
+    parser.add_argument("--with_plugin_data", action="store_true", help="whether use plugin data to quantize model")
+    args = parser.parse_args()
+
+    max_memory = dict()
+    if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
+        if torch.cuda.is_available():
+            max_memory.update(
+                {i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
+            )
+    if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
+        max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
+    if not max_memory:
+        max_memory = None
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.pretrained_model_dir,
+        use_fast=args.fast_tokenizer,
+        trust_remote_code=True
+    )
+    model = AutoGPTQForCausalLM.from_pretrained(
+        args.pretrained_model_dir,
+        quantize_config=BaseQuantizeConfig(bits=args.bits, group_size=args.group_size),
+        max_memory=max_memory
+    )
+
+    examples = load_data(tokenizer, with_plugin=args.with_plugin_data)
+    examples_for_quant = [
+        {"input_ids": example["input_ids"], "attention_mask": example["attention_mask"]}
+        for example in examples
+    ]
+
+    start = time.time()
+    model.quantize(
+        examples_for_quant,
+        batch_size=args.quant_batch_size,
+        use_triton=args.use_triton,
+        autotune_warmup_after_quantized=args.use_triton
+    )
+    end = time.time()
+    print(f"quantization took: {end - start: .4f}s")
+
+    if not args.quantized_model_dir:
+        args.quantized_model_dir = args.pretrained_model_dir
+
+    model.save_quantized(args.quantized_model_dir)
+    print(f"quantized model saved to {args.quantized_model_dir}")
+
+
+if __name__ == "__main__":
+    import logging
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
+    )
+
+    main()

From 45e1b94bca2664ff471b2f6ba2eb7ff1428835ef Mon Sep 17 00:00:00 2001
From: PanQiWei <594557445@qq.com>
Date: Sat, 29 Apr 2023 11:22:32 +0800
Subject: [PATCH 15/15] install auto-gptq from source code

---
 setup_env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup_env.py b/setup_env.py
index 2fa4860..a070716 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -6,7 +6,7 @@
 # change version based on your situation
 pip_torch = "torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116"
 
-pip_auto_gptq = "auto-gptq -i https://pypi.org/simple"
+pip_auto_gptq = "git+https://github.com/PanQiWei/AutoGPTQ.git"
 
 pip_dependencies = [
     "transformers==4.25.1",