Commit f5703b0
Add Auto-Round support (#581)
Squashed commit history (each commit signed off by yiliu30 <[email protected]>):

* initial flow for autoround
* update flow
* use int4 kernel
* remove debug code
* update the forward
* clean code
* e2e example
* refine code
* add requirements for test
* update test
* update the readme
* add readme
* update the filenames
* update the np version
* add demo
* format
* add more docs
* format
* add doc
* use `AffineQuantizedTensor`
* impl ar using multensors
* clean code
* use hook + multensors
* separate mul_tensors into a new file
* fix typos
* rename mul_tensor to multi_tensor
* enable amp
* eval model
* add gen examples
* add warmup to benchmark
* add benchmark
* clean code
* format code
* use tiny kernel
* add more note
* format
* correct typos
* remove hard code
* use intx
* enable offload for multitensor
* update the default config
* refine note
* update the version check
* format
* update
* add ut
* format
* add scripts
* format code
* format
* update
* fix typo
* refine bench code
* Enable `use_optimized_layer_output` and AO' llama (#12)
* Refine the Doc (#14)
* add more docstring
* add paper link
* correct some note
* add cmd
* udpdate the scripts
* revert some change
* Add a lightweight configuration for quick benchmarking (#15)
* update quant method name
* Wrap model's buffers and params to `MultiTensor` & update the results (#16)
  * wrap model's buffers and params to `MultiTensor` and update the results
1 parent 0987dd6 commit f5703b0

File tree

13 files changed: +1477 −2 lines

test/prototype/test_autoround.py

+174
@@ -0,0 +1,174 @@
import pytest
from torchao.prototype.autoround.utils import is_auto_round_available

if not is_auto_round_available():
    pytest.skip("AutoRound is not available", allow_module_level=True)

import torch
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
    run_tests,
    TestCase,
)
from torchao import quantize_

from torchao.dtypes import AffineQuantizedTensor
from torchao.prototype.autoround.core import (
    apply_auto_round,
    prepare_model_for_applying_auto_round_,
)
from torchao.prototype.autoround.multi_tensor import MultiTensor
from torchao.utils import TORCH_VERSION_AT_LEAST_2_5

_AVAILABLE_DEVICES = ["cpu"] + (["cuda"] if torch.cuda.is_available() else [])


# Copied from https://github.com/pytorch/ao/pull/721
class TwoLinear(torch.nn.Module):
    def __init__(self, in_features=64, out_features=128):
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features, out_features)
        self.linear2 = torch.nn.Linear(in_features, out_features)

    def forward(self, x, y):
        x = self.linear1(x)
        y = self.linear2(y)
        return x + y


class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.two_linear1 = TwoLinear()
        self.two_linear2 = TwoLinear(128, 256)

    def forward(self, x, y):
        x1 = self.two_linear1(x, y)
        x2 = self.two_linear2(x1, x1)
        return x2


def _is_two_linear(mod, fqn):
    return isinstance(mod, TwoLinear)


class ModelWithInplaceOp(torch.nn.Module):
    def __init__(self, DIM=128):
        super().__init__()
        self.lin = torch.nn.Linear(DIM, DIM)
        self.register_buffer("other", torch.zeros(DIM, DIM))

    def forward(self, x, idx):
        x = x + self.lin(x)
        # update buffer
        self.other[idx] = x
        return x


class M2(torch.nn.Module):
    def __init__(self, DIM=128):
        super().__init__()
        self.m1 = ModelWithInplaceOp(DIM)
        self.m2 = ModelWithInplaceOp(DIM)

    def forward(self, x, idx):
        x = self.m1(x, idx)
        x = self.m2(x, idx)
        return x


def _check_params_and_buffers_type(module, check_fun):
    return [check_fun(p) for p in module.parameters()] + [
        check_fun(b) for b in module.buffers()
    ]


class TestAutoRound(TestCase):

    @pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="Requires torch 2.5 or later")
    @parametrize("device", _AVAILABLE_DEVICES)
    @torch.no_grad()
    def test_auto_round(self, device: str):
        # End-to-end flow: prepare -> calibrate with MultiTensor -> quantize_ with apply_auto_round().
        example_inputs = (
            torch.randn(32, 64).to(device),
            torch.randn(32, 64).to(device),
        )
        m = M().eval().to(device)
        before_quant = m(*example_inputs)
        prepare_model_for_applying_auto_round_(
            m,
            is_target_module=_is_two_linear,
            bits=7,
            group_size=32,
            iters=20,
            device=device,
        )
        assert all(
            _check_params_and_buffers_type(m, lambda x: isinstance(x, MultiTensor))
        ), "Expected all parameters and buffers to be `MultiTensor`."
        input1 = []
        input2 = []
        for _ in range(10):
            input1.append(torch.randn(32, 64).to(device))
            input2.append(torch.randn(32, 64).to(device))

        mt_input1 = MultiTensor(input1)
        mt_input2 = MultiTensor(input2)
        out = m(mt_input1, mt_input2)
        assert isinstance(out, MultiTensor), f"Expected MultiTensor, got {type(out)}"
        assert all(
            _check_params_and_buffers_type(m, lambda x: not isinstance(x, MultiTensor))
        ), "Expected all parameters and buffers to have been converted back to plain tensors."
        quantize_(m, apply_auto_round(), _is_two_linear, device=device)
        for l in m.modules():
            if isinstance(l, torch.nn.Linear):
                assert isinstance(l.weight, AffineQuantizedTensor)
        after_quant = m(*example_inputs)
        assert after_quant is not None, "Quantized model forward pass failed"

    @pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="Requires torch 2.5 or later")
    @parametrize("device", _AVAILABLE_DEVICES)
    @torch.no_grad()
    def test_wrap_model_with_multi_tensor(self, device: str):
        # Same flow, but the target modules perform in-place buffer updates.
        _is_model_with_inplace_op = lambda mod, fqn: isinstance(mod, ModelWithInplaceOp)

        DIM = 128
        m = M2(DIM).eval().to(device)
        prepare_model_for_applying_auto_round_(
            m,
            is_target_module=_is_model_with_inplace_op,
            bits=7,
            group_size=32,
            iters=20,
            device=device,
        )
        assert all(
            _check_params_and_buffers_type(m, lambda x: isinstance(x, MultiTensor))
        ), "Expected all parameters and buffers to be `MultiTensor`."
        input1 = []
        input2 = []
        for _ in range(2):
            input1.append(torch.randn(DIM, DIM).to(device))
            input2.append(torch.randint(0, DIM, (DIM,), dtype=torch.long).to(device))

        mt_input1 = MultiTensor(input1)
        mt_input2 = MultiTensor(input2)
        out = m(mt_input1, mt_input2)
        assert isinstance(out, MultiTensor), f"Expected MultiTensor, got {type(out)}"
        assert all(
            _check_params_and_buffers_type(m, lambda x: not isinstance(x, MultiTensor))
        ), "Expected all parameters and buffers to have been converted back to plain tensors."
        quantize_(m, apply_auto_round(), _is_model_with_inplace_op, device=device)
        for l in m.modules():
            if isinstance(l, torch.nn.Linear):
                assert isinstance(l.weight, AffineQuantizedTensor)
        after_quant = m(input1[0], input2[0])
        assert after_quant is not None, "Quantized model forward pass failed"


instantiate_parametrized_tests(TestAutoRound)

if __name__ == "__main__":
    run_tests()
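For reference, the flow these tests exercise can be condensed into a short standalone sketch. This is a minimal sketch, not the test itself: it reuses the toy `M` and `_is_two_linear` helpers defined in the test above, and the bit width, group size, and iteration count are illustrative.

```python
import torch

from torchao import quantize_
from torchao.prototype.autoround.core import (
    apply_auto_round,
    prepare_model_for_applying_auto_round_,
)
from torchao.prototype.autoround.multi_tensor import MultiTensor

# 1) Register optimization hooks and wrap params/buffers as MultiTensor.
model = M().eval()
prepare_model_for_applying_auto_round_(
    model,
    is_target_module=_is_two_linear,
    bits=4,
    group_size=32,
    iters=20,
    device="cpu",
)

# 2) Replay all calibration batches in a single forward call; the hooks
#    tune the rounding/clipping values of each target module.
x = MultiTensor([torch.randn(32, 64) for _ in range(10)])
y = MultiTensor([torch.randn(32, 64) for _ in range(10)])
model(x, y)

# 3) Materialize the tuned quantization parameters as AffineQuantizedTensor weights.
quantize_(model, apply_auto_round(), _is_two_linear)
```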

torchao/_models/llama/benchmarks.sh

+12
@@ -12,6 +12,12 @@
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt

+# auto-round w/ quant_lm_head
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround
+# auto-round w/o quant_lm_head
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround-cuda-0
+
+
 export MODEL_REPO=meta-llama/Meta-Llama-3-8B
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
@@ -23,6 +29,12 @@
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int4wo-64 --write_result benchmark_results.txt
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant --write_result benchmark_results.txt

+# auto-round w/ quant_lm_head
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround
+# auto-round w/o quant_lm_head
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization autoround-cuda-0
+
+
 export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --cache_size 8192 --kv_cache_quantization
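The `autoround` argument also accepts a longer dash-separated configuration, parsed in `generate.py` (next diff). A hypothetical fully specified invocation, with illustrative values, would look like:

```bash
# autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>
# (illustrative values; trailing fields may be omitted and fall back to the lightweight defaults)
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile \
    --quantization autoround-cuda-1-200-128-8-2048-128 --write_result benchmark_results.txt
```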

torchao/_models/llama/generate.py

+49-2
@@ -30,7 +30,7 @@ def device_sync(device):
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))

-from torchao._models.llama.model import Transformer, prepare_inputs_for_model
+from torchao._models.llama.model import Transformer, prepare_inputs_for_model, TransformerBlock
 from torchao._models.llama.tokenizer import get_tokenizer

 def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
@@ -219,6 +219,53 @@ def main(
             groupsize=int(quantization.split("-")[-1])
             assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
             quantize_(model, int4_weight_only(group_size=groupsize))
+
+        if "autoround" in quantization:
+            from torchao.prototype.autoround.autoround_llm import quantize_model_with_autoround_
+            from transformers import AutoTokenizer
+
+            _tokenizer = AutoTokenizer.from_pretrained(checkpoint_path.parent)
+            # Parse args from the quantization string:
+            # autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>
+            # Omitted fields fall back to a lightweight configuration for generation benchmarking.
+            _quant_args = quantization.split("-")
+            _default_quant_args = [True, 1, 128, 1, 512, 32]
+            _model_device = _quant_args[1] if len(_quant_args) > 1 else device
+            _quant_args = _quant_args[2:]
+            quant_lm_head, iters, groupsize, batch_size, seqlen, nsamples = [
+                int(x) for x in _quant_args
+            ] + _default_quant_args[len(_quant_args) :]
+            model = model.to(_model_device)
+            print(
+                (
+                    f"Quantizing model with autoround(iters={iters}, groupsize={groupsize}, "
+                    f"quant_lm_head={quant_lm_head}, batch_size={batch_size}, seqlen={seqlen}, nsamples={nsamples})"
+                )
+            )
+            with torch.device(_model_device):
+                model.setup_caches(
+                    max_batch_size=batch_size, max_seq_length=seqlen, training=True
+                )
+
+            if quant_lm_head:
+                is_target_module = (
+                    lambda mod, fqn: isinstance(mod, TransformerBlock) or "output" in fqn
+                )
+            else:
+                is_target_module = lambda mod, fqn: isinstance(mod, TransformerBlock)
+            quantize_model_with_autoround_(
+                model=model,
+                tokenizer=_tokenizer,
+                is_target_module=is_target_module,
+                bits=4,
+                seqlen=seqlen,
+                bs=batch_size,
+                iters=iters,
+                nsamples=nsamples,
+            )
+            model.to(device)
+            model.reset_caches()
+
         if "fp6" in quantization:
             quantize_(model, fpx_weight_only(3, 2))
         if "autoquant" == quantization:
@@ -387,7 +434,7 @@ def callback(x):
     parser.add_argument('--top_k', type=int, default=200, help='Top-k for sampling.')
     parser.add_argument('--temperature', type=float, default=0.8, help='Temperature for sampling.')
     parser.add_argument('--checkpoint_path', type=Path, default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), help='Model checkpoint path.')
-    parser.add_argument('-q', '--quantization', type=str, help='Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, autoquant')
+    parser.add_argument('-q', '--quantization', type=str, help='Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, autoquant, autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>')
     parser.add_argument('--kv_cache_quantization', action='store_true', help='Whether to quantize the KV cache')
     parser.add_argument('--cache_size', type=int, default=None, help='Force size of cache to be a certain number of tokens, if not set, will use max_new_tokens+prompt_size')
     parser.add_argument('--linear_causal_mask', action='store_true', help='Whether to use the memory efficient, but slightly less fast, linear causal mask (important for long context lengths)')
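To make the defaults-padding in the parsing above concrete, here is a small self-contained sketch using a hypothetical quantization string:

```python
# Mirrors the parsing added to generate.py: fields omitted from the string
# fall back to the lightweight benchmarking defaults.
quantization = "autoround-cuda-1-200-128"  # hypothetical CLI value
_default_quant_args = [True, 1, 128, 1, 512, 32]

_quant_args = quantization.split("-")
_model_device = _quant_args[1] if len(_quant_args) > 1 else "cuda"
_quant_args = _quant_args[2:]
quant_lm_head, iters, groupsize, batch_size, seqlen, nsamples = [
    int(x) for x in _quant_args
] + _default_quant_args[len(_quant_args):]

print(quant_lm_head, iters, groupsize, batch_size, seqlen, nsamples)
# -> 1 200 128 1 512 32  (batch_size, seqlen, nsamples fall back to the defaults)
```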

torchao/_models/llama/model.py

+9
@@ -190,7 +190,16 @@
             dtype,
             use_scaled=self.config.use_scaled_rope
         )
+
+    def reset_caches(self):
+        """Reset caches.

+        The caches used for training and inference may differ; reset them before switching between the two.
+        """
+        self.max_batch_size = -1
+        self.max_seq_length = -1
+        self.freqs_cis: Optional[Tensor] = None
+        self.mask_cache: Optional[Tensor] = None

     def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
         """Forward pass of the model.

torchao/prototype/autoround/README.md

+104
@@ -0,0 +1,104 @@
# Auto-Round

Auto-Round is an advanced quantization algorithm designed for low-bit LLM inference. It leverages [sign gradient descent](https://arxiv.org/abs/1905.12938) to fine-tune the rounding values and min/max clipping values of weights. The approach is competitive with recent methods, introduces no additional inference overhead, and keeps tuning costs low. This module provides end-to-end examples for quantizing floating-point models to low bit-widths, integrated with torchao's `quantize_` API and low-bit kernels.
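At a high level (notation ours, simplified from the paper), Auto-Round learns a bounded rounding perturbation $V$ and min/max adjustment terms $\alpha, \beta$ for each weight group, optimized with signed gradient descent so that the quantized layer reproduces the original layer's output on calibration data:

$$
\tilde{W} = s \cdot \left(\operatorname{clamp}\!\left(\left\lfloor \frac{W}{s} + V \right\rceil + zp,\ 0,\ 2^{\text{bits}} - 1\right) - zp\right),
\qquad
\min_{V,\,\alpha,\,\beta}\ \left\lVert W X - \tilde{W} X \right\rVert_F^2
$$

where $s$ and $zp$ are the scale and zero point derived from the ($\alpha$, $\beta$-adjusted) min/max of each weight group, $V \in [-0.5, 0.5]$, and $X$ is the calibration input to the layer.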
## Usage

### Quick Start

```bash
python autoround_llm.py -m /model/name/or/path
```


> [!NOTE]
> Before running, make sure `auto-round` is installed: `pip install -r requirements.txt`.


### Detailed Usage

`Auto-Round` is a calibration-based quantization algorithm. The flow involves three main steps: 1) insert hooks into the modules you want to quantize, 2) wrap the calibration data with `MultiTensor` and run the model, and 3) replace the optimized weights with `AffineQuantizedTensor` to select the appropriate low-bit kernel.

> [!NOTE]
> To learn more about the flow and `MultiTensor`, please refer to [this example](https://github.com/pytorch/ao/blob/main/tutorials/calibration_flow/gptq_like.py).

#### Step 1: Prepare the Model
```python
import torch
import transformers

model = ...  # Load your model
model_device = next(model.parameters()).device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define a function to identify target modules for quantization.
# For example, to apply Auto-Round to all decoder layers and the `lm_head` in a Llama model:
decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer
is_target_module = lambda mod, fqn: isinstance(mod, decoder_cls) or "lm_head" in fqn
# Prepare the model for Auto-Round
from torchao.prototype.autoround.core import prepare_model_for_applying_auto_round_

prepare_model_for_applying_auto_round_(
    model,
    is_target_module=is_target_module,
    bits=4,
    group_size=128,
    iters=200,
    device=device,
)
```
> [!NOTE]
> To avoid OOM issues, load the model on CPU and set `device` to `'cuda'`.

#### Step 2: Apply Optimization
Wrap all inputs as a `MultiTensor` to track all calibration data for the optimized modules:

```python
from torchao.prototype.autoround.multi_tensor import MultiTensor

input_ids_lst = []
for data in dataloader:
    input_ids_lst.append(data["input_ids"].to(model_device))

multi_t_input_ids = MultiTensor(input_ids_lst)
# The optimization is applied during the forward pass
out = model(multi_t_input_ids)
```
#### Step 3: Finalize Quantization
After obtaining the optimized `zero_point` and `scale` values, create an `AffineQuantizedTensor`
for each target weight to select the appropriate low-bit kernel.

```python
from torchao import quantize_
from torchao.prototype.autoround.core import apply_auto_round

quantize_(model, apply_auto_round(), is_target_module)
```
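As a quick sanity check (mirroring the unit tests in this PR), you can verify that the target linear layers now hold `AffineQuantizedTensor` weights. The scan below assumes every `Linear` is covered by `is_target_module`; otherwise restrict it to the targeted submodules:

```python
import torch
from torchao.dtypes import AffineQuantizedTensor

for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        assert isinstance(module.weight, AffineQuantizedTensor), f"{name} was not quantized"
```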

## End-to-End Results
### [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)
|                 | Avg.   | Mmlu   | Piqa   | Winogrande | Hellaswag | Lambada_openai |
| --------------- | ------ | ------ | ------ | ---------- | --------- | -------------- |
| bf16            | 0.7080 | 0.6783 | 0.8003 | 0.7403     | 0.5910    | 0.7303         |
| auto-round-4bit | 0.6988 | 0.6533 | 0.7949 | 0.7372     | 0.5837    | 0.7250         |
| torchao-int4wo  | 0.6883 | 0.6363 | 0.7938 | 0.7348     | 0.5784    | 0.6980         |

### [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
|                 | Avg.   | Mmlu   | Piqa   | Winogrande | Hellaswag | Lambada_openai |
| --------------- | ------ | ------ | ------ | ---------- | --------- | -------------- |
| bf16            | 0.6881 | 0.6389 | 0.7840 | 0.7222     | 0.5772    | 0.7184         |
| auto-round-4bit | 0.6818 | 0.6232 | 0.7862 | 0.7230     | 0.5661    | 0.7105         |
| torchao-int4wo  | 0.6728 | 0.5939 | 0.7737 | 0.7222     | 0.5612    | 0.7132         |


### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
|                 | Avg.   | Mmlu   | Piqa   | Winogrande | Hellaswag | Lambada_openai |
| --------------- | ------ | ------ | ------ | ---------- | --------- | -------------- |
| bf16            | 0.6347 | 0.4647 | 0.7644 | 0.6606     | 0.577     | 0.7070         |
| auto-round-4bit | 0.6327 | 0.4534 | 0.7590 | 0.6661     | 0.5706    | 0.7143         |
| torchao-int4wo  | 0.6252 | 0.4427 | 0.7617 | 0.6654     | 0.5674    | 0.6889         |

> [!NOTE]
> - `auto-round-4bit` represents the following configuration: `bits=4`, `iters=200`, `seqlen=2048`, `train_bs=8`, `group_size=128`, and `quant_lm_head=False`. <br>
> - `torchao-int4wo` represents `int4_weight_only(group_size=128)` and `quant_lm_head=False`.
> - If the model includes operations without a deterministic implementation (such as Flash Attention), the results may differ slightly.


## Credits

- Paper: https://arxiv.org/abs/2309.05516
- Authors: [Intel® Neural Compressor Team](https://github.com/intel/neural-compressor)
