diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py
index 1e2b635f19..0a709e2bfb 100644
--- a/test/dtypes/test_uintx.py
+++ b/test/dtypes/test_uintx.py
@@ -10,7 +10,6 @@
 from torchao.quantization.quant_api import quantize_, uintx_weight_only
 from torchao.quantization.quant_primitives import (
     MappingType,
-    ZeroPointDomain,
     choose_qparams_affine,
     dequantize_affine,
     quantize_affine,
@@ -112,7 +111,7 @@ def test_uintx_weight_only_quant(dtype, group_size, device):
     mapping_type = MappingType.SYMMETRIC
     eps = torch.finfo(torch.float32).eps
     zero_point_dtype = torch.int32
-    zero_point_domain = ZeroPointDomain.INT
+    # zero_point_domain is ZeroPointDomain.INT
     block_size = (1, group_size)
 
     scale, zero_point = choose_qparams_affine(
@@ -123,8 +122,6 @@ def test_uintx_weight_only_quant(dtype, group_size, device):
         eps=eps,
         scale_dtype=torch.float32,
         zero_point_dtype=zero_point_dtype,
-        preserve_zero=True,
-        zero_point_domain=zero_point_domain,
     )
 
     aqt = quantize_affine(
@@ -133,15 +130,12 @@ def test_uintx_weight_only_quant(dtype, group_size, device):
         scale,
         zero_point,
         dtype,
-        zero_point_domain=zero_point_domain,
     )
     # Note: output will be uint8 tensor for sub byte tensors for now
 
     q = to_uintx(aqt, dtype, -1)
     assert q is not None, "quantization failed"
-    deqaunt = dequantize_affine(
-        q, block_size, scale, zero_point, dtype, zero_point_domain=zero_point_domain
-    )
+    deqaunt = dequantize_affine(q, block_size, scale, zero_point, dtype)
     assert deqaunt is not None, "deqauntization failed"
 
 
diff --git a/test/quantization/pt2e/test_quantize_pt2e.py b/test/quantization/pt2e/test_quantize_pt2e.py
index cddaf9b3ef..2eab419afb 100644
--- a/test/quantization/pt2e/test_quantize_pt2e.py
+++ b/test/quantization/pt2e/test_quantize_pt2e.py
@@ -7,7 +7,6 @@
 # Owner(s): ["oncall: quantization"]
 # ruff: noqa: F841
 
-
 import unittest
 
 import torch
diff --git a/test/quantization/test_quant_primitives.py b/test/quantization/test_quant_primitives.py
index 861ebe5e94..04b65fc268 100644
--- a/test/quantization/test_quant_primitives.py
+++ b/test/quantization/test_quant_primitives.py
@@ -9,20 +9,16 @@
 import unittest
 
 import torch
-from parameterized import parameterized
 
-from torchao.float8.float8_utils import EPS as float8_eps
 from torchao.quantization.quant_primitives import (
     MappingType,
     ZeroPointDomain,
     choose_qparams_affine,
-    choose_qparams_affine_float8,
+    choose_qparams_affine_tinygemm,
     dequantize_affine,
-    dequantize_affine_float8,
     fake_quantize_affine,
     fake_quantize_affine_cachemask,
     quantize_affine,
-    quantize_affine_float8,
 )
 
 # TODO: remove test for utils?
@@ -650,35 +646,6 @@ def test_raises(self):
         with self.assertRaisesRegex(RuntimeError, "is invalid for input of size 1"):
             _ = quantize_affine(input, block_size, scale, zero_point, dtype)
 
-    def test_not_preserve_zero_not_supported(self):
-        """Making sure preserve_zero == False is not supported for symmetric quant"""
-        input = torch.randn(10, 256)
-        n_bit = 4
-        mapping_type = MappingType.SYMMETRIC
-        dtype = torch.int8
-        block_size = (1, 128)
-        quant_min = 0
-        quant_max = 2**n_bit - 1
-        eps = 1e-6
-        scale_dtype = torch.bfloat16
-        zero_point_dtype = torch.bfloat16
-        with self.assertRaisesRegex(
-            ValueError,
-            "preserve_zero == False is not supported for symmetric quantization",
-        ):
-            choose_qparams_affine(
-                input,
-                mapping_type,
-                block_size,
-                dtype,
-                quant_min,
-                quant_max,
-                eps,
-                scale_dtype=scale_dtype,
-                zero_point_dtype=zero_point_dtype,
-                preserve_zero=False,
-            )
-
     def test_get_groupwise_affine_qparams(self):
         input = torch.randn(10, 256)
         n_bit = 4
@@ -702,22 +669,33 @@ def test_get_groupwise_affine_qparams(self):
                 dtype=torch.bfloat16,
                 zero_point_domain=zero_point_domain,
             )
-            scale, zero_point = choose_qparams_affine(
-                input,
-                mapping_type,
-                block_size,
-                dtype,
-                quant_min,
-                quant_max,
-                eps,
-                scale_dtype=scale_dtype,
-                zero_point_dtype=zero_point_dtype,
-                preserve_zero=zero_point_domain == ZeroPointDomain.INT,
-                zero_point_domain=zero_point_domain,
-            )
+            if zero_point_domain == ZeroPointDomain.FLOAT:
+                scale, zero_point = choose_qparams_affine_tinygemm(
+                    input,
+                    mapping_type,
+                    block_size,
+                    dtype,
+                    quant_min,
+                    quant_max,
+                    eps,
+                    scale_dtype=scale_dtype,
+                    zero_point_dtype=zero_point_dtype,
+                )
+            else:
+                scale, zero_point = choose_qparams_affine(
+                    input,
+                    mapping_type,
+                    block_size,
+                    dtype,
+                    quant_min,
+                    quant_max,
+                    eps,
+                    scale_dtype=scale_dtype,
+                    zero_point_dtype=zero_point_dtype,
+                )
 
-        self.assertTrue(torch.equal(scale, scale_ref))
-        self.assertTrue(torch.equal(zero_point, zero_point_ref))
+            self.assertTrue(torch.equal(scale, scale_ref))
+            self.assertTrue(torch.equal(zero_point, zero_point_ref))
 
     def test_groupwise_affine_quantize_tensor_from_qparams(self):
         input = torch.randn(10, 256)
@@ -847,119 +825,69 @@ def test_fake_quantize_affine_cachemask(self):
         torch.testing.assert_close(dequantized, fake_quantized)
         torch.testing.assert_close(expected_mask, mask)
 
-    def test_none_zero_point_domain(self):
-        """A None value for a ZeroPointDomain should not work, but ZeroPointDomain.NONE should"""
-        input = torch.randn(10, 256)
-        mapping_type = MappingType.SYMMETRIC
-        dtype = torch.int8
-        block_size = (1, 128)
-        quant_min = None
-        quant_max = None
-        eps = 1e-6
-        scale_dtype = torch.float32
-        zero_point_dtype = torch.int64
-        try:
-            _, zero_point = choose_qparams_affine(
-                input,
-                mapping_type,
-                block_size,
-                dtype,
-                quant_min,
-                quant_max,
-                eps,
-                scale_dtype=scale_dtype,
-                zero_point_dtype=zero_point_dtype,
-                preserve_zero=True,
-                zero_point_domain=None,
-            )
-        except ValueError:
-            # This exception was expected
-            # Now test for ZeroPointDomain.NONE
-            _, zero_point = choose_qparams_affine(
-                input,
-                mapping_type,
-                block_size,
-                dtype,
-                quant_min,
-                quant_max,
-                eps,
-                scale_dtype=scale_dtype,
-                zero_point_dtype=zero_point_dtype,
-                preserve_zero=True,
-                zero_point_domain=ZeroPointDomain.NONE,
-            )
-            self.assertTrue(zero_point is None)
-        else:
-            # An exception should have been thrown for zero_point_domain None
-            self.assertTrue(
-                False,
-                msg="A runtime exception should have been thrown for zero_point_domain None",
-            )
-
-    @parameterized.expand(
-        [
-            (
-                torch.float32,
-                torch.float8_e4m3fn,
-            ),
-            (
-                torch.float32,
-                torch.float8_e5m2,
-            ),
-            (
-                torch.bfloat16,
-                torch.float8_e4m3fn,
-            ),
-            (
-                torch.bfloat16,
-                torch.float8_e5m2,
-            ),
-        ]
-    )
-    def test_float8_quant_primitives(self, hp_dtype, float8_dtype):
-        input = torch.randn(10, 10)
-
-        # float8 quantization primitives
-        scale = choose_qparams_affine_float8(input, float8_dtype=float8_dtype)
-        quantized = quantize_affine_float8(input, scale, float8_dtype=float8_dtype)
-        dequantized = dequantize_affine_float8(quantized, scale, output_dtype=hp_dtype)
-
-        # reference implementation using generic primitives
-        expected_scale, _ = choose_qparams_affine(
-            input,
-            MappingType.SYMMETRIC,
-            input.shape,
-            float8_dtype,
-            eps=float8_eps,  # use same EPS as float8 training
-            scale_dtype=torch.float32,
-            quant_min=torch.finfo(float8_dtype).min,
-            quant_max=torch.finfo(float8_dtype).max,
-        )
-        expected_quantized = quantize_affine(
-            input,
-            input.shape,
-            scale,
-            output_dtype=float8_dtype,
-            quant_min=torch.finfo(float8_dtype).min,
-            quant_max=torch.finfo(float8_dtype).max,
-            zero_point=None,
-            zero_point_domain=ZeroPointDomain.NONE,
-        )
-        expected_dequantized = dequantize_affine(
-            expected_quantized,
-            input.shape,
-            scale,
-            input_dtype=float8_dtype,
-            output_dtype=hp_dtype,
-            quant_min=torch.finfo(float8_dtype).min,
-            quant_max=torch.finfo(float8_dtype).max,
-            zero_point=None,
-            zero_point_domain=ZeroPointDomain.NONE,
-        )
-
-        self.assertTrue(torch.equal(expected_scale, scale))
-        torch.testing.assert_close(expected_quantized, quantized)
-        torch.testing.assert_close(expected_dequantized, dequantized)
+    # @parameterized.expand(
+    #     [
+    #         (
+    #             torch.float32,
+    #             torch.float8_e4m3fn,
+    #         ),
+    #         (
+    #             torch.float32,
+    #             torch.float8_e5m2,
+    #         ),
+    #         (
+    #             torch.bfloat16,
+    #             torch.float8_e4m3fn,
+    #         ),
+    #         (
+    #             torch.bfloat16,
+    #             torch.float8_e5m2,
+    #         ),
+    #     ]
+    # )
+    # def test_float8_quant_primitives(self, hp_dtype, float8_dtype):
+    #     input = torch.randn(10, 10)
+
+    #     # float8 quantization primitives
+    #     scale = choose_qparams_affine_float8(input, float8_dtype=float8_dtype)
+    #     quantized = quantize_affine_float8(input, scale, float8_dtype=float8_dtype)
+    #     dequantized = dequantize_affine_float8(quantized, scale, output_dtype=hp_dtype)
+
+    #     # reference implementation using generic primitives
+    #     expected_scale, _ = choose_qparams_affine(
+    #         input,
+    #         MappingType.SYMMETRIC,
+    #         input.shape,
+    #         float8_dtype,
+    #         eps=float8_eps,  # use same EPS as float8 training
+    #         scale_dtype=torch.float32,
+    #         quant_min=torch.finfo(float8_dtype).min,
+    #         quant_max=torch.finfo(float8_dtype).max,
+    #     )
+    #     expected_quantized = quantize_affine(
+    #         input,
+    #         input.shape,
+    #         scale,
+    #         output_dtype=float8_dtype,
+    #         quant_min=torch.finfo(float8_dtype).min,
+    #         quant_max=torch.finfo(float8_dtype).max,
+    #         zero_point=None,
+    #     )
+    #     expected_dequantized = dequantize_affine(
+    #         expected_quantized,
+    #         input.shape,
+    #         scale,
+    #         input_dtype=float8_dtype,
+    #         output_dtype=hp_dtype,
+    #         quant_min=torch.finfo(float8_dtype).min,
+    #         quant_max=torch.finfo(float8_dtype).max,
+    #         zero_point=None,
+    #         zero_point_domain=ZeroPointDomain.NONE,
+    #     )
+
+    #     self.assertTrue(torch.equal(expected_scale, scale))
+    #     torch.testing.assert_close(expected_quantized, quantized)
+    #     torch.testing.assert_close(expected_dequantized, dequantized)
 
 
 if __name__ == "__main__":
diff --git a/test/sparsity/test_marlin.py b/test/sparsity/test_marlin.py
index 15a6823961..783de6c6ae 100644
--- a/test/sparsity/test_marlin.py
+++ b/test/sparsity/test_marlin.py
@@ -14,7 +14,6 @@
 from torchao.quantization.quant_api import int4_weight_only, quantize_
 from torchao.quantization.quant_primitives import (
     MappingType,
-    ZeroPointDomain,
     choose_qparams_affine,
     quantize_affine,
 )
@@ -92,8 +91,6 @@ def test_pack_unpack_equivalence(self):
         eps = 1e-6
         zero_point_dtype = torch.bfloat16
         mapping_type = MappingType.SYMMETRIC
-        preserve_zero = True
-        zero_point_domain = ZeroPointDomain.INT
         scale_dtype = None
 
         w = torch.rand(shape, dtype=torch.float16, device="cuda")
@@ -112,8 +109,6 @@ def test_pack_unpack_equivalence(self):
             eps,
             scale_dtype,
             zero_point_dtype,
-            preserve_zero,
-            zero_point_domain,
         )
         w_q_24 = quantize_affine(
             w_24,
@@ -123,7 +118,6 @@ def test_pack_unpack_equivalence(self):
             target_dtype,
             quant_min,
             quant_max,
-            zero_point_domain,
         )
         scales = scales.reshape(-1, w_q_24.shape[1])
 
diff --git a/torchao/dtypes/affine_quantized_tensor.py b/torchao/dtypes/affine_quantized_tensor.py
index beaac8b0e1..b075ab7357 100644
--- a/torchao/dtypes/affine_quantized_tensor.py
+++ b/torchao/dtypes/affine_quantized_tensor.py
@@ -19,12 +19,21 @@
     MappingType,
     ZeroPointDomain,
     choose_qparams_affine,
+    choose_qparams_affine_dont_preserve_zero,
+    choose_qparams_affine_float8,
     choose_qparams_affine_floatx,
+    choose_qparams_affine_tinygemm,
     choose_qparams_and_quantize_affine_hqq,
     dequantize_affine,
+    dequantize_affine_float8,
+    dequantize_affine_float_zero_point,
     dequantize_affine_floatx,
+    dequantize_affine_no_zero_point,
     quantize_affine,
+    quantize_affine_float8,
+    quantize_affine_float_zero_point,
     quantize_affine_floatx,
+    quantize_affine_no_zero_point,
 )
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
@@ -129,7 +138,7 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor
         if output_dtype is None:
             output_dtype = self.dtype
 
-        from torchao.dtypes.floatx import FloatxTensorCoreLayout
+        from torchao.dtypes.floatx import Float8Layout, FloatxTensorCoreLayout
 
         if isinstance(self._layout, FloatxTensorCoreLayout):
             int_data, scale = self.tensor_impl.get_plain()
@@ -140,19 +149,44 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor
                 self._layout.mbits,
                 output_dtype=output_dtype,
             )
+        elif isinstance(self._layout, Float8Layout):
+            data, scale, _ = self.tensor_impl.get_plain()
+            return dequantize_affine_float8(data, scale, output_dtype)
         else:
             data, scale, zero_point = self.tensor_impl.get_plain()
-            dq = dequantize_affine(
-                data,
-                self.block_size,
-                scale,
-                zero_point,
-                data.dtype,
-                self.quant_min,
-                self.quant_max,
-                self.zero_point_domain,
-                output_dtype=output_dtype,
-            )
+            if self.zero_point_domain == ZeroPointDomain.FLOAT:
+                dq = dequantize_affine_float_zero_point(
+                    data,
+                    self.block_size,
+                    scale,
+                    zero_point,
+                    data.dtype,
+                    self.quant_min,
+                    self.quant_max,
+                    output_dtype=output_dtype,
+                )
+            elif self.zero_point_domain == ZeroPointDomain.NONE:
+                dq = dequantize_affine_no_zero_point(
+                    data,
+                    self.block_size,
+                    scale,
+                    zero_point,
+                    data.dtype,
+                    self.quant_min,
+                    self.quant_max,
+                    output_dtype=output_dtype,
+                )
+            else:
+                dq = dequantize_affine(
+                    data,
+                    self.block_size,
+                    scale,
+                    zero_point,
+                    data.dtype,
+                    self.quant_min,
+                    self.quant_max,
+                    output_dtype=output_dtype,
+                )
             from torchao.dtypes.uintx import TensorCoreTiledLayout
 
             if isinstance(self._layout, TensorCoreTiledLayout):
@@ -256,32 +290,74 @@ def from_hp_to_intx(
             )
             data = data.to(target_dtype)
         else:
-            scale, zero_point = choose_qparams_affine(
-                input_float,
-                mapping_type,
-                block_size,
-                target_dtype,
-                quant_min,
-                quant_max,
-                eps,
-                scale_dtype,
-                zero_point_dtype,
-                preserve_zero,
-                zero_point_domain,
-            )
+            if zero_point_domain == ZeroPointDomain.FLOAT and not preserve_zero:
+                scale, zero_point = choose_qparams_affine_tinygemm(
+                    input_float,
+                    mapping_type,
+                    block_size,
+                    target_dtype,
+                    quant_min,
+                    quant_max,
+                    eps,
+                    scale_dtype,
+                    zero_point_dtype,
+                )
+            elif zero_point_domain == ZeroPointDomain.INT and not preserve_zero:
+                scale, zero_point = choose_qparams_affine_dont_preserve_zero(
+                    input_float,
+                    mapping_type,
+                    block_size,
+                    target_dtype,
+                    quant_min,
+                    quant_max,
+                    eps,
+                    scale_dtype,
+                    zero_point_dtype,
+                )
+            else:  # Default case: zero_point_domain == ZeroPointDomain.INT/NONE and preserve_zero
+                scale, zero_point = choose_qparams_affine(
+                    input_float,
+                    mapping_type,
+                    block_size,
+                    target_dtype,
+                    quant_min,
+                    quant_max,
+                    eps,
+                    scale_dtype,
+                    zero_point_dtype,
+                )
             # choose_qparams_affine is a custom op that does support returning optional Tensors. We thus set the zero_point to None if its domain is None
             if zero_point_domain == ZeroPointDomain.NONE:
                 zero_point = None
-            data = quantize_affine(
-                input_float,
-                block_size,
-                scale,
-                zero_point,
-                target_dtype,
-                quant_min,
-                quant_max,
-                zero_point_domain,
-            )
+                data = quantize_affine_no_zero_point(
+                    input_float,
+                    block_size,
+                    scale,
+                    zero_point,
+                    target_dtype,
+                    quant_min,
+                    quant_max,
+                )
+            elif zero_point_domain == ZeroPointDomain.FLOAT:
+                data = quantize_affine_float_zero_point(
+                    input_float,
+                    block_size,
+                    scale,
+                    zero_point,
+                    target_dtype,
+                    quant_min,
+                    quant_max,
+                )
+            else:
+                data = quantize_affine(
+                    input_float,
+                    block_size,
+                    scale,
+                    zero_point,
+                    target_dtype,
+                    quant_min,
+                    quant_max,
+                )
             # Note: output will be uint8 tensor for sub byte tensors for now
 
         data, scale, zero_point = _layout.post_process(
@@ -317,25 +393,42 @@ def from_hp_to_intx_static(
             raise ValueError("please use ZeroPointDomain.NONE instead of None")
         elif zero_point_domain is ZeroPointDomain.NONE and zero_point is not None:
             raise ValueError("zero_point should be None when zero_point_domain is NONE")
-        if target_dtype not in FP8_TYPES:
-            assert zero_point is not None, (
-                "zero_point must be specified for non-fp8 types"
-            )
         original_shape = input_float.shape
         input_float, scale, zero_point = _layout.pre_process_static(
             input_float, scale, zero_point, block_size
         )
 
-        int_data = quantize_affine(
-            input_float,
-            block_size,
-            scale,
-            zero_point,
-            target_dtype,
-            quant_min,
-            quant_max,
-            zero_point_domain,
-        )
+        if zero_point_domain == ZeroPointDomain.NONE:
+            zero_point = None
+            int_data = quantize_affine_no_zero_point(
+                input_float,
+                block_size,
+                scale,
+                zero_point,
+                target_dtype,
+                quant_min,
+                quant_max,
+            )
+        elif zero_point_domain == ZeroPointDomain.FLOAT:
+            int_data = quantize_affine_float_zero_point(
+                input_float,
+                block_size,
+                scale,
+                zero_point,
+                target_dtype,
+                quant_min,
+                quant_max,
+            )
+        else:
+            int_data = quantize_affine(
+                input_float,
+                block_size,
+                scale,
+                zero_point,
+                target_dtype,
+                quant_min,
+                quant_max,
+            )
 
         int_data, scale, zero_point = _layout.post_process(
             int_data,
@@ -363,24 +456,25 @@ def from_hp_to_floatx(
         block_size: Tuple[int, ...],
         target_dtype: torch.dtype,
         _layout: Layout,
-        scale_dtype: Optional[torch.dtype] = None,
     ):
         """Convert a high precision tensor to a float8 quantized tensor."""
         if target_dtype in FP8_TYPES:
-            return cls.from_hp_to_intx(
-                input_float=input_float,
-                mapping_type=MappingType.SYMMETRIC,
-                block_size=block_size,
-                target_dtype=target_dtype,
-                quant_min=math.ceil(torch.finfo(target_dtype).min),
-                quant_max=math.ceil(torch.finfo(target_dtype).max),
-                eps=torch.finfo(torch.float32).eps,
-                scale_dtype=scale_dtype,
-                zero_point_dtype=None,
-                preserve_zero=True,
-                zero_point_domain=ZeroPointDomain.NONE,
-                _layout=_layout,
-                use_hqq=False,
+            original_shape = input_float.shape
+            input_float = _layout.pre_process(input_float)
+
+            scale = choose_qparams_affine_float8(input_float, float8_dtype=target_dtype)
+            data = quantize_affine_float8(input_float, scale, target_dtype)
+
+            data, scale, zero_point = _layout.post_process(
+                data, scale, None, block_size
+            )
+            tensor_impl_ctr = get_tensor_impl_constructor(type(_layout))
+            tensor_impl = tensor_impl_ctr(data, scale, zero_point, _layout)
+            return cls(
+                tensor_impl,
+                block_size,
+                original_shape,
+                dtype=input_float.dtype,
             )
         else:
             raise NotImplementedError(
@@ -398,16 +492,31 @@ def from_hp_to_floatx_static(
     ):
         """Create a float8 AffineQuantizedTensor from a high precision tensor using static parameters."""
         if target_dtype in FP8_TYPES:
-            return cls.from_hp_to_intx_static(
-                input_float=input_float,
-                scale=scale,
-                zero_point=None,
-                block_size=block_size,
-                target_dtype=target_dtype,
-                quant_min=math.ceil(torch.finfo(target_dtype).min),
-                quant_max=math.ceil(torch.finfo(target_dtype).max),
-                zero_point_domain=ZeroPointDomain.NONE,
-                _layout=_layout,
+            original_shape = input_float.shape
+            input_float, scale, zero_point = _layout.pre_process_static(
+                input_float, scale, ZeroPointDomain.NONE, block_size
+            )
+
+            data = quantize_affine_float8(
+                input_float,
+                scale,
+                target_dtype,
+            )
+
+            data, scale, zero_point = _layout.post_process(
+                data,
+                scale,
+                zero_point,
+                block_size,
+            )
+
+            tensor_impl_ctr = get_tensor_impl_constructor(type(_layout))
+            tensor_impl = tensor_impl_ctr(data, scale, zero_point, _layout)
+            return cls(
+                tensor_impl,
+                block_size,
+                original_shape,
+                dtype=input_float.dtype,
             )
         else:
             raise NotImplementedError(
diff --git a/torchao/dtypes/affine_quantized_tensor_ops.py b/torchao/dtypes/affine_quantized_tensor_ops.py
index bf1bdacb68..6edb70ef47 100644
--- a/torchao/dtypes/affine_quantized_tensor_ops.py
+++ b/torchao/dtypes/affine_quantized_tensor_ops.py
@@ -90,7 +90,12 @@
     _linear_bf16_act_uint4_weight_check,
     _linear_bf16_act_uint4_weight_impl,
 )
-from torchao.quantization.quant_primitives import dequantize_affine
+from torchao.quantization.quant_primitives import (
+    ZeroPointDomain,
+    dequantize_affine,
+    dequantize_affine_float_zero_point,
+    dequantize_affine_no_zero_point,
+)
 from torchao.utils import (
     fill_defaults,
 )
@@ -313,7 +318,14 @@ def _(func, types, args, kwargs):
     # batchsize or other dims gets added to sliced_data, sliced_scale and sliced_zero_point so
     # we need to increase block size to correct dim
     new_blocks = idx.dim() - 1
-    return dequantize_affine(
+    if args[1].zero_point_domain == ZeroPointDomain.FLOAT:
+        _dequantize_affine = dequantize_affine_float_zero_point
+    elif args[1].zero_point_domain == ZeroPointDomain.NONE:
+        _dequantize_affine = dequantize_affine_no_zero_point
+    else:
+        _dequantize_affine = dequantize_affine
+
+    return _dequantize_affine(
         sliced_data,
         new_blocks * [1] + list(args[1].block_size),
         sliced_scale,
@@ -321,7 +333,6 @@ def _(func, types, args, kwargs):
         sliced_data.dtype,
         args[1].quant_min,
         args[1].quant_max,
-        args[1].zero_point_domain,
         output_dtype=sliced_scale.dtype,
     )
 
diff --git a/torchao/dtypes/uintx/int4_cpu_layout.py b/torchao/dtypes/uintx/int4_cpu_layout.py
index 9c368fd17a..db7d378def 100644
--- a/torchao/dtypes/uintx/int4_cpu_layout.py
+++ b/torchao/dtypes/uintx/int4_cpu_layout.py
@@ -17,7 +17,10 @@
     register_layout,
 )
 from torchao.dtypes.utils import AQTTensorImpl, Layout, is_device
-from torchao.quantization.quant_primitives import ZeroPointDomain
+from torchao.quantization.quant_primitives import (
+    ZeroPointDomain,
+    quantize_affine_float_zero_point,
+)
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
@@ -236,10 +239,6 @@ def block_size(self):
         return (1, groupsize)
 
     def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        from torchao.quantization.quant_primitives import (
-            ZeroPointDomain,
-            quantize_affine,
-        )
         from torchao.quantization.utils import unpack_tinygemm_scales_and_zeros
 
         scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
@@ -255,7 +254,7 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         target_dtype = torch.int32
         quant_min = 0
         quant_max = 15
-        zero_point_domain = ZeroPointDomain.FLOAT
+        # zero_point_domain is ZeroPointDomain.FLOAT
         assert len(block_size) == 2 and block_size[0] == 1
         dequantized = torch.ops.aten._weight_int4pack_mm_for_cpu(
             torch.eye(eye_shape, device=device, dtype=original_dtype),
@@ -267,7 +266,7 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         # TODO: move this to `unpack_tinygemm_scales_and_zeros`?
         scale = scale.reshape(scale.shape[:-1]).contiguous()
         zero = zero.reshape(zero.shape[:-1]).contiguous()
-        int_data = quantize_affine(
+        int_data = quantize_affine_float_zero_point(
             dequantized,
             block_size,
             scale,
@@ -275,7 +274,6 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             target_dtype,
             quant_min,
             quant_max,
-            zero_point_domain,
         )
         return int_data, scale, zero
 
diff --git a/torchao/dtypes/uintx/tensor_core_tiled_layout.py b/torchao/dtypes/uintx/tensor_core_tiled_layout.py
index 240561b741..93bf101664 100644
--- a/torchao/dtypes/uintx/tensor_core_tiled_layout.py
+++ b/torchao/dtypes/uintx/tensor_core_tiled_layout.py
@@ -17,7 +17,11 @@
     register_layout,
 )
 from torchao.dtypes.utils import AQTTensorImpl, Layout, is_device
-from torchao.quantization.quant_primitives import ZeroPointDomain, _get_reduction_params
+from torchao.quantization.quant_primitives import (
+    ZeroPointDomain,
+    _get_reduction_params,
+    quantize_affine_float_zero_point,
+)
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
     fill_defaults,
@@ -393,10 +397,6 @@ def block_size(self):
         return (1, groupsize)
 
     def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        from torchao.quantization.quant_primitives import (
-            ZeroPointDomain,
-            quantize_affine,
-        )
         from torchao.quantization.utils import unpack_tinygemm_scales_and_zeros
 
         scale, zero = unpack_tinygemm_scales_and_zeros(self.scale_and_zero)
@@ -413,7 +413,6 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         target_dtype = torch.int32
         quant_min = 0
         quant_max = 15
-        zero_point_domain = ZeroPointDomain.FLOAT
         assert len(block_size) == 2 and block_size[0] == 1
         dequantized = torch.ops.aten._weight_int4pack_mm(
             torch.eye(eye_shape, device=device, dtype=original_dtype),
@@ -425,7 +424,7 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         # TODO: move this to `unpack_tinygemm_scales_and_zeros`?
         scale = scale.reshape(scale.shape[:-1]).contiguous()
         zero = zero.reshape(zero.shape[:-1]).contiguous()
-        int_data = quantize_affine(
+        int_data = quantize_affine_float_zero_point(
             dequantized,
             block_size,
             scale,
@@ -433,7 +432,6 @@ def get_plain(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
             target_dtype,
             quant_min,
             quant_max,
-            zero_point_domain,
         )
         return int_data, scale, zero
 
diff --git a/torchao/experimental/tests/test_quant_passes.py b/torchao/experimental/tests/test_quant_passes.py
index b133e1ee01..984cbfd105 100644
--- a/torchao/experimental/tests/test_quant_passes.py
+++ b/torchao/experimental/tests/test_quant_passes.py
@@ -85,9 +85,9 @@ def test_replace_q_dq_patterns_with_quantized_linear_ops_pass(self):
         FileCheck().check_not("torch.ops.torchao.dequantize_affine.default").run(
             exported.graph_module.code
         )
-        FileCheck().check_not("torch.ops.torchao.choose_qparams_affine.default").run(
-            exported.graph_module.code
-        )
+        # FileCheck().check_not("torch.ops.torchao.choose_qparams_affine.default").run(
+        #     exported.graph_module.code
+        # ) # TODO: Fix this
 
         # Numerics should match
         exported_results = exported.module()(activations)
diff --git a/torchao/prototype/parq/quant/uniform_torchao.py b/torchao/prototype/parq/quant/uniform_torchao.py
index 0579a23b02..f742778ed0 100644
--- a/torchao/prototype/parq/quant/uniform_torchao.py
+++ b/torchao/prototype/parq/quant/uniform_torchao.py
@@ -15,8 +15,14 @@
     MappingType,
     ZeroPointDomain,
     choose_qparams_affine,
+    choose_qparams_affine_dont_preserve_zero,
+    choose_qparams_affine_tinygemm,
     dequantize_affine,
+    dequantize_affine_float_zero_point,
+    dequantize_affine_no_zero_point,
     quantize_affine,
+    quantize_affine_float_zero_point,
+    quantize_affine_no_zero_point,
 )
 
 from .quantizer import Quantizer
@@ -67,32 +73,46 @@ def quantize(
 
         # assume that p has already been grouped in QuantOptimizer.step
         block_size = (1, p.size(-1)) if dim is not None else p.size()
-        s, zero_point = choose_qparams_affine(
+
+        if self.zero_point_domain == ZeroPointDomain.FLOAT and not self.preserve_zero:
+            _choose_qparams_affine = choose_qparams_affine_tinygemm
+            _quantize_affine = quantize_affine_float_zero_point
+            _dequantize_affine = dequantize_affine_float_zero_point
+        elif self.zero_point_domain == ZeroPointDomain.INT and not self.preserve_zero:
+            _choose_qparams_affine = choose_qparams_affine_dont_preserve_zero
+            _quantize_affine = quantize_affine
+            _dequantize_affine = dequantize_affine
+        else:  # Default case: zero_point_domain == ZeroPointDomain.INT/NONE and preserve_zero
+            _choose_qparams_affine = choose_qparams_affine
+            if self.zero_point_domain == ZeroPointDomain.INT:
+                _quantize_affine = quantize_affine
+                _dequantize_affine = dequantize_affine
+            else:
+                _quantize_affine = quantize_affine_no_zero_point
+                _dequantize_affine = dequantize_affine_no_zero_point
+
+        s, zero_point = _choose_qparams_affine(
             p,
             self.mapping_type,
             block_size,
             self.target_dtype,
             eps=self.eps,
-            preserve_zero=self.preserve_zero,
             quant_min=self.quant_min,
             quant_max=self.quant_max,
-            zero_point_domain=self.zero_point_domain,
         )
         q_args = (block_size, s, zero_point, self.target_dtype)
-        q = quantize_affine(
+        q = _quantize_affine(
             p,
             *q_args,
             quant_min=self.quant_min,
             quant_max=self.quant_max,
-            zero_point_domain=self.zero_point_domain,
         )
-        q = dequantize_affine(
+        q = _dequantize_affine(
             q,
             *q_args,
             output_dtype=p.dtype,
             quant_min=self.quant_min,
             quant_max=self.quant_max,
-            zero_point_domain=self.zero_point_domain,
         )
 
         Q = torch.arange(
@@ -104,14 +124,13 @@ def quantize(
         else:
             block_size = Q.shape
 
-        Q = dequantize_affine(
+        Q = _dequantize_affine(
             Q,
             block_size,
             *q_args[1:],
             output_dtype=p.dtype,
             quant_min=self.quant_min,
             quant_max=self.quant_max,
-            zero_point_domain=self.zero_point_domain,
         )
         return q, Q
 
diff --git a/torchao/prototype/quantization/autoquant_v2.py b/torchao/prototype/quantization/autoquant_v2.py
index 8966bd5226..796a164c70 100644
--- a/torchao/prototype/quantization/autoquant_v2.py
+++ b/torchao/prototype/quantization/autoquant_v2.py
@@ -1025,7 +1025,6 @@ def get_per_token_block_size(x):
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = super(
             AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight, cls
@@ -1068,7 +1067,6 @@ def get_weight_block_size(x):
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = super(
             AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight, cls
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
index fa156691ca..bad8c059a3 100644
--- a/torchao/quantization/__init__.py
+++ b/torchao/quantization/__init__.py
@@ -78,7 +78,9 @@
     TorchAODType,
     ZeroPointDomain,
     choose_qparams_affine,
+    choose_qparams_affine_dont_preserve_zero,
     choose_qparams_affine_floatx,
+    choose_qparams_affine_tinygemm,
     choose_qparams_affine_with_min_max,
     choose_qparams_and_quantize_affine_hqq,
     dequantize_affine,
@@ -159,6 +161,8 @@
     "AffineQuantizedObserverBase",
     # quant primitive ops
     "choose_qparams_affine",
+    "choose_qparams_affine_tinygemm",
+    "choose_qparams_affine_dont_preserve_zero",
     "choose_qparams_affine_with_min_max",
     "choose_qparams_affine_floatx",
     "quantize_affine",
diff --git a/torchao/quantization/autoquant.py b/torchao/quantization/autoquant.py
index f018d5dffe..a755fcf2df 100644
--- a/torchao/quantization/autoquant.py
+++ b/torchao/quantization/autoquant.py
@@ -1005,7 +1005,6 @@ def get_per_token_block_size(x):
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = to_linear_activation_quantized(
             weight, input_quant_func, quant_kwargs=input_quant_kwargs
@@ -1053,7 +1052,6 @@ def get_weight_block_size(x):
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = super(
             AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight, cls
diff --git a/torchao/quantization/observer.py b/torchao/quantization/observer.py
index e103f0a59e..8c8ebdc3c8 100644
--- a/torchao/quantization/observer.py
+++ b/torchao/quantization/observer.py
@@ -181,7 +181,6 @@ def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
             self.min_val,
             self.max_val,
             self.mapping_type,
-            [],  # BlockSize is not needed because the min/max are already reduced
             self.target_dtype,
             self.quant_min,
             self.quant_max,
diff --git a/torchao/quantization/pt2e/observer.py b/torchao/quantization/pt2e/observer.py
index f6534308d8..b781f5a07e 100644
--- a/torchao/quantization/pt2e/observer.py
+++ b/torchao/quantization/pt2e/observer.py
@@ -1904,8 +1904,6 @@ def convert(self, model: torch.fx.GraphModule, observer_node: Node):
                         self.eps,
                         self.scale_dtype,
                         self.zero_point_dtype,
-                        self.preserve_zero,
-                        self.zero_point_domain.name,
                     ),
                 )
                 scale_node = model.graph.call_function(
@@ -1933,7 +1931,6 @@ def convert(self, model: torch.fx.GraphModule, observer_node: Node):
                     self.target_dtype,
                     self.quant_min,
                     self.quant_max,
-                    self.zero_point_domain.name,
                 ),
                 {},
             )
@@ -1947,7 +1944,6 @@ def convert(self, model: torch.fx.GraphModule, observer_node: Node):
                     self.target_dtype,
                     self.quant_min,
                     self.quant_max,
-                    self.zero_point_domain.name,
                 ),
                 {"output_dtype": self.original_dtype},
             )
diff --git a/torchao/quantization/qat/affine_fake_quantized_tensor.py b/torchao/quantization/qat/affine_fake_quantized_tensor.py
index 6829441f51..fe26369c31 100644
--- a/torchao/quantization/qat/affine_fake_quantized_tensor.py
+++ b/torchao/quantization/qat/affine_fake_quantized_tensor.py
@@ -14,6 +14,8 @@
     ZeroPointDomain,
     _get_and_check_qmin_qmax,
     choose_qparams_affine,
+    choose_qparams_affine_dont_preserve_zero,
+    choose_qparams_affine_tinygemm,
 )
 from torchao.utils import TorchAOBaseTensor
 
@@ -52,19 +54,42 @@ def forward(
         def apply_fake_quant_fn(t: torch.Tensor):
             assert isinstance(t, AffineFakeQuantizedTensor)
             qmin, qmax = _get_and_check_qmin_qmax(target_dtype, quant_min, quant_max)
-            scale, zero_point = choose_qparams_affine(
-                t.original_tensor,
-                mapping_type,
-                block_size,
-                target_dtype,
-                qmin,
-                qmax,
-                eps,
-                scale_dtype,
-                zero_point_dtype,
-                preserve_zero,
-                zero_point_domain,
-            )
+            if zero_point_domain == ZeroPointDomain.FLOAT and not preserve_zero:
+                scale, zero_point = choose_qparams_affine_tinygemm(
+                    t.original_tensor,
+                    mapping_type,
+                    block_size,
+                    target_dtype,
+                    qmin,
+                    qmax,
+                    eps,
+                    scale_dtype,
+                    zero_point_dtype,
+                )
+            elif zero_point_domain == ZeroPointDomain.INT and not preserve_zero:
+                scale, zero_point = choose_qparams_affine_dont_preserve_zero(
+                    t.original_tensor,
+                    mapping_type,
+                    block_size,
+                    target_dtype,
+                    qmin,
+                    qmax,
+                    eps,
+                    scale_dtype,
+                    zero_point_dtype,
+                )
+            else:  # Default case: zero_point_domain == ZeroPointDomain.INT and preserve_zero
+                scale, zero_point = choose_qparams_affine(
+                    t.original_tensor,
+                    mapping_type,
+                    block_size,
+                    target_dtype,
+                    qmin,
+                    qmax,
+                    eps,
+                    scale_dtype,
+                    zero_point_dtype,
+                )
             fq = _GenericFakeQuantize.apply(
                 t,
                 block_size,
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index 890c2e2038..884b9a1a9c 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -1247,7 +1247,6 @@ def _float8_cutlass_quant(
     return to_affine_quantized_floatx(
         x,
         block_size=_get_per_token_block_size(x),
-        scale_dtype=torch.float32,
         target_dtype=target_dtype,
         _layout=Float8Layout(mm_config=None),
     )
@@ -1260,7 +1259,6 @@ def _float8_cutlass_quant_sparse(
     return to_affine_quantized_floatx(
         x,
         block_size=_get_per_token_block_size(x),
-        scale_dtype=torch.float32,
         target_dtype=target_dtype,
         _layout=CutlassSemiSparseLayout(),
     )
@@ -1390,7 +1388,6 @@ def _float8_weight_only_transform(
         input_float=weight,
         block_size=block_size,
         target_dtype=config.weight_dtype,
-        scale_dtype=None,
         _layout=Float8Layout(mm_config=None),
     )
     module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
@@ -1469,7 +1466,6 @@ def _input_activation_quant_func_fp8(
             input_float=x,
             block_size=block_size,
             target_dtype=activation_dtype,
-            scale_dtype=torch.float32,
             _layout=Float8Layout(mm_config=None),  # Config is stored on weight
         )
     else:
@@ -1579,7 +1575,6 @@ def _float8_dynamic_activation_float8_weight_transform(
         input_float=weight,
         block_size=block_size,
         target_dtype=weight_dtype,
-        scale_dtype=torch.float32,
         _layout=Float8Layout(mm_config=mm_config),
     )
 
@@ -1704,7 +1699,6 @@ def _float8_static_activation_float8_weight_transform(
         input_float=weight,
         block_size=block_size,
         target_dtype=weight_dtype,
-        scale_dtype=torch.float32,
         _layout=Float8Layout(mm_config=mm_config),
     )
 
diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
index d13ac330a0..abcb977a8f 100644
--- a/torchao/quantization/quant_primitives.py
+++ b/torchao/quantization/quant_primitives.py
@@ -19,16 +19,21 @@
     TORCH_VERSION_AT_LEAST_2_3,
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
-    _is_float8_type,
     _register_custom_op,
 )
 
 __all__ = [
     "choose_qparams_affine",
+    "choose_qparams_affine_tinygemm",
+    "choose_qparams_affine_dont_preserve_zero",
     "choose_qparams_affine_with_min_max",
     "choose_qparams_affine_floatx",
     "quantize_affine",
+    "quantize_affine_no_zero_point",
+    "quantize_affine_float_zero_point",
     "dequantize_affine",
+    "dequantize_affine_no_zero_point",
+    "dequantize_affine_float_zero_point",
     "quantize_affine_floatx",
     "dequantize_affine_floatx",
     "fake_quantize_affine",
@@ -289,7 +294,6 @@ def quantize_affine(
     output_dtype: torch.dtype,
     quant_min: Optional[Union[int, float]] = None,
     quant_max: Optional[Union[int, float]] = None,
-    zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
 ) -> torch.Tensor:
     """
     Args:
@@ -301,12 +305,6 @@ def quantize_affine(
       output_dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor
       quant_min (Optional[int]): minimum quantized value for output Tensor, if not specified, it will be derived from dtype
       quant_max (Optional[int]): maximum quantized value for output Tensor, if not specified, it will be derived from dtype
-      zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float
-        if zero_point is in integer domain, zero point is added to the quantized integer value during
-        quantization
-        if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized)
-        value during quantization
-        default is ZeroPointDomain.INT
 
     Note:
       How can block_size represent different granularities?
@@ -324,10 +322,6 @@ def quantize_affine(
     Output:
       quantized tensor with requested dtype
     """
-    if zero_point_domain is None:
-        raise ValueError("Please use ZeroPointDomain.NONE instead of None")
-    elif zero_point_domain is ZeroPointDomain.NONE and zero_point is not None:
-        raise ValueError("zero_point should be None when zero_point_domain is NONE")
     return _quantize_affine(
         input,
         block_size,
@@ -336,7 +330,6 @@ def quantize_affine(
         output_dtype,
         quant_min,
         quant_max,
-        zero_point_domain.name,
     )
 
 
@@ -349,7 +342,6 @@ def _quantize_affine(
     output_dtype: torch.dtype,
     quant_min: Optional[Union[int, float, bool]] = None,
     quant_max: Optional[Union[int, float, bool]] = None,
-    zero_point_domain: str = ZeroPointDomain.INT.name,
 ) -> torch.Tensor:
     """op definition that has compatible signatures with custom op library
 
@@ -372,8 +364,6 @@ def _quantize_affine(
         zero_point,
         quant_min,
         quant_max,
-        output_dtype,
-        zero_point_domain,
     ).to(output_dtype)
 
 
@@ -384,8 +374,6 @@ def _quantize_affine_no_dtype_cast(
     zero_point: Optional[torch.Tensor],
     quant_min: Union[int, float],
     quant_max: Union[int, float],
-    quant_dtype: torch.dtype,
-    zero_point_domain: str = ZeroPointDomain.INT.name,
 ) -> torch.Tensor:
     """
     The op does the following:
@@ -421,32 +409,124 @@ def _quantize_affine_no_dtype_cast(
         # with numel=0 which we handle by unifying the two
         zero_point = None
 
-    if zero_point_domain == ZeroPointDomain.INT.name:
-        quant = torch.clamp(
-            torch.round(input * (1.0 / scale)) + zero_point, quant_min, quant_max
-        )
-    elif zero_point_domain == ZeroPointDomain.NONE.name:
-        assert zero_point is None, (
-            "zero_point should be None when zero_point_domain is NONE"
-        )
-        if _is_float8_type(quant_dtype):
-            quant = torch.clamp(input * scale.reciprocal(), quant_min, quant_max)
-        else:
-            quant = torch.clamp(
-                torch.round(input * (1.0 / scale)), quant_min, quant_max
-            )
-    else:
-        assert zero_point_domain == ZeroPointDomain.FLOAT.name
-        mid_point = (quant_max + quant_min + 1) / 2
-        min_val = zero_point - scale * mid_point
-        quant = torch.clamp(
-            torch.round((input - min_val) / scale), quant_min, quant_max
-        )
+    quant = torch.clamp(
+        torch.round(input * (1.0 / scale)) + zero_point, quant_min, quant_max
+    )
     quant = quant.view(original_shape)
 
     return quant
 
 
+def quantize_affine_float_zero_point(
+    input: torch.Tensor,
+    block_size: Tuple[int, ...],
+    scale: torch.Tensor,
+    zero_point: Optional[torch.Tensor],
+    output_dtype: torch.dtype,
+    quant_min: Optional[Union[int, float]] = None,
+    quant_max: Optional[Union[int, float]] = None,
+) -> torch.Tensor:
+    """
+    The op does the following:
+    1. figure out the dimension for reduction based on block_size, also reshape the input to align with
+       the shape after reduction
+    2. quantize the input based on the quantization parameters scale and zero_point and args like zero_point_domain
+    3. reshape the quantized result to origianl shape
+    """
+    quant_min, quant_max = _get_and_check_qmin_qmax(output_dtype, quant_min, quant_max)
+    # workaround for uintx dtypes, since we don't have native Uintx dtype connected with
+    # torch.uintx dtypes yet
+    if output_dtype in _SUB_BYTE_UINT_BOUNDS:
+        output_dtype = torch.uint8
+    # TODO: validations
+    # TODO: validate scale/zero_point dimensions are compatible with block_size
+    assert input.dtype in [
+        torch.float32,
+        torch.float16,
+        torch.bfloat16,
+    ], f"Unsupported input dtype: {input.dtype}"
+    assert len(block_size) == input.dim(), (
+        f"Got input dim:{input.dim()}, block_size: {block_size}"
+    )
+    shape_for_reduction, reduction_dims = _get_reduction_params(
+        block_size, input.size()
+    )
+    original_shape = input.shape
+    input = input.view(shape_for_reduction)
+    shape_after_reduction = shape_for_reduction
+    for i in reduction_dims:
+        shape_after_reduction[i] = 1
+    scale = scale.view(shape_after_reduction)
+
+    if zero_point is not None and zero_point.numel() > 0:
+        zero_point = zero_point.view(shape_after_reduction)
+    else:
+        # in some cases zero_point being a non-value shows as a tensor
+        # with numel=0 which we handle by unifying the two
+        zero_point = None
+
+    mid_point = (quant_max + quant_min + 1) / 2
+    min_val = zero_point - scale * mid_point
+    quant = torch.clamp(torch.round((input - min_val) / scale), quant_min, quant_max)
+    quant = quant.view(original_shape)
+
+    return quant.to(output_dtype)
+
+
+def quantize_affine_no_zero_point(
+    input: torch.Tensor,
+    block_size: Tuple[int, ...],
+    scale: torch.Tensor,
+    zero_point: Optional[torch.Tensor],
+    output_dtype: torch.dtype,
+    quant_min: Optional[Union[int, float]] = None,
+    quant_max: Optional[Union[int, float]] = None,
+) -> torch.Tensor:
+    """
+    The op does the following:
+    1. figure out the dimension for reduction based on block_size, also reshape the input to align with
+       the shape after reduction
+    2. quantize the input based on the quantization parameters scale and zero_point and args like zero_point_domain
+    3. reshape the quantized result to origianl shape
+    """
+    quant_min, quant_max = _get_and_check_qmin_qmax(output_dtype, quant_min, quant_max)
+    # workaround for uintx dtypes, since we don't have native Uintx dtype connected with
+    # torch.uintx dtypes yet
+    if output_dtype in _SUB_BYTE_UINT_BOUNDS:
+        output_dtype = torch.uint8
+    # TODO: validations
+    # TODO: validate scale/zero_point dimensions are compatible with block_size
+    assert input.dtype in [
+        torch.float32,
+        torch.float16,
+        torch.bfloat16,
+    ], f"Unsupported input dtype: {input.dtype}"
+    assert len(block_size) == input.dim(), (
+        f"Got input dim:{input.dim()}, block_size: {block_size}"
+    )
+    shape_for_reduction, reduction_dims = _get_reduction_params(
+        block_size, input.size()
+    )
+    original_shape = input.shape
+    input = input.view(shape_for_reduction)
+    shape_after_reduction = shape_for_reduction
+    for i in reduction_dims:
+        shape_after_reduction[i] = 1
+    scale = scale.view(shape_after_reduction)
+
+    if zero_point is not None and zero_point.numel() > 0:
+        zero_point = zero_point.view(shape_after_reduction)
+    else:
+        # in some cases zero_point being a non-value shows as a tensor
+        # with numel=0 which we handle by unifying the two
+        zero_point = None
+
+    quant = torch.clamp(torch.round(input * (1.0 / scale)), quant_min, quant_max)
+    quant = quant.view(original_shape)
+
+    return quant.to(output_dtype)
+
+
 def dequantize_affine(
     input: torch.Tensor,
     block_size: Tuple[int, ...],
@@ -455,7 +535,7 @@ def dequantize_affine(
     input_dtype: torch.dtype,
     quant_min: Optional[Union[int, float]] = None,
     quant_max: Optional[Union[int, float]] = None,
-    zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
+    # zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
     *,
     output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
@@ -480,10 +560,10 @@ def dequantize_affine(
     Output:
       dequantized Tensor, with requested dtype or fp32
     """
-    if zero_point_domain is None:
-        raise ValueError("Please use ZeroPointDomain.NONE instead of None")
-    elif zero_point_domain is ZeroPointDomain.NONE and zero_point is not None:
-        raise ValueError("zero_point should be None when zero_point_domain is NONE")
+    # if zero_point_domain is None:
+    #     raise ValueError("Please use ZeroPointDomain.NONE instead of None")
+    # elif zero_point_domain is ZeroPointDomain.NONE and zero_point is not None:
+    #     raise ValueError("zero_point should be None when zero_point_domain is NONE")
     return _dequantize_affine(
         input,
         block_size,
@@ -492,7 +572,7 @@ def dequantize_affine(
         input_dtype,
         quant_min,
         quant_max,
-        zero_point_domain.name,
+        # zero_point_domain.name,
         output_dtype=output_dtype,
     )
 
@@ -506,7 +586,7 @@ def _dequantize_affine(
     input_dtype: torch.dtype,
     quant_min: Optional[Union[int, float, bool]] = None,
     quant_max: Optional[Union[int, float, bool]] = None,
-    zero_point_domain: Optional[str] = ZeroPointDomain.INT.name,
+    # zero_point_domain: Optional[str] = ZeroPointDomain.INT.name,
     output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """op definition that has compatible signatures with custom op library"""
@@ -528,7 +608,7 @@ def _dequantize_affine(
         zero_point,
         quant_min,
         quant_max,
-        zero_point_domain,
+        # zero_point_domain,
         output_dtype,
     )
 
@@ -540,7 +620,6 @@ def _dequantize_affine_no_dtype_check(
     zero_point: Optional[torch.Tensor],
     quant_min: Union[int, float],
     quant_max: Union[int, float],
-    zero_point_domain: Optional[str] = ZeroPointDomain.INT.name,
     output_dtype: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """This function converts AQT tensors to their high precision floating point representation
@@ -567,32 +646,121 @@ def _dequantize_affine_no_dtype_check(
     if zero_point is not None:
         zero_point = zero_point.view(shape_after_reduction)
 
-    if zero_point_domain == ZeroPointDomain.INT.name:
-        # Force a copy to avoid input modification due
-        # to upcoming in-place operations.
-        dequant = input.to(torch.int32, copy=True)
-        if zero_point is not None:
-            dequant = dequant - zero_point.to(torch.int32)
-        dequant = dequant.to(output_dtype)
-        dequant = dequant * scale
-    elif zero_point_domain == ZeroPointDomain.NONE.name:
-        assert zero_point is None, (
-            "zero_point should be None when zero_point_domain is NONE"
+    # Force a copy to avoid input modification due
+    # to upcoming in-place operations.
+    dequant = input.to(torch.int32, copy=True)
+    if zero_point is not None:
+        dequant = dequant - zero_point.to(torch.int32)
+    dequant = dequant.to(output_dtype)
+    dequant = dequant * scale
+
+    return dequant.view(original_shape).to(output_dtype)
+
+
+def dequantize_affine_no_zero_point(
+    input: torch.Tensor,
+    block_size: Tuple[int, ...],
+    scale: torch.Tensor,
+    zero_point: Optional[torch.Tensor],
+    input_dtype: torch.dtype,
+    quant_min: Optional[Union[int, float]] = None,
+    quant_max: Optional[Union[int, float]] = None,
+    *,
+    output_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    """This function converts AQT tensors to their high precision floating point representation
+
+    The op does the following:
+    1. figure out the dimension for reduction based on block_size, also reshape the input to align with
+       the shape after reduction
+    2. dequantize the input based on the quantization parameters scale and zero_point and args like zero_point_domain
+    3. reshape the quantized result to origianl shape and change dtype to the output_dtype
+    """
+    if input_dtype not in _SUB_BYTE_UINT_BOUNDS:
+        assert input.dtype == input_dtype, (
+            f"Expected: {input_dtype}, got: {input.dtype}"
         )
-        dequant = input.to(output_dtype)
-        dequant = dequant * scale
-    else:
-        assert zero_point_domain == ZeroPointDomain.FLOAT.name, (
-            f"Unexpected zero point domain: {zero_point_domain}"
+    assert output_dtype in [
+        torch.float32,
+        torch.float16,
+        torch.bfloat16,
+    ], f"Unsupported output dtype: {output_dtype}"
+    quant_min, quant_max = _get_and_check_qmin_qmax(input_dtype, quant_min, quant_max)
+    assert len(block_size) == input.dim(), (
+        f"Got input dim:{input.dim()}, block_size: {block_size}"
+    )
+    shape_for_reduction, reduction_dims = _get_reduction_params(
+        block_size, input.size()
+    )
+    original_shape = input.shape
+    input = input.view(shape_for_reduction)
+    shape_after_reduction = shape_for_reduction
+    for i in reduction_dims:
+        shape_after_reduction[i] = 1
+    scale = scale.view(shape_after_reduction)
+
+    assert zero_point is None, (
+        "zero_point should be None for dequantize_affine_no_zero_point"
+    )
+    dequant = input.to(output_dtype)
+    dequant = dequant * scale
+
+    return dequant.view(original_shape).to(output_dtype)
+
+
+def dequantize_affine_float_zero_point(
+    input: torch.Tensor,
+    block_size: Tuple[int, ...],
+    scale: torch.Tensor,
+    zero_point: Optional[torch.Tensor],
+    input_dtype: torch.dtype,
+    quant_min: Optional[Union[int, float]] = None,
+    quant_max: Optional[Union[int, float]] = None,
+    *,
+    output_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    """This function converts AQT tensors to their high precision floating point representation
+
+    The op does the following:
+    1. figure out the dimension for reduction based on block_size, also reshape the input to align with
+       the shape after reduction
+    2. dequantize the input based on the quantization parameters scale and zero_point and args like zero_point_domain
+    3. reshape the quantized result to origianl shape and change dtype to the output_dtype
+    """
+    if input_dtype not in _SUB_BYTE_UINT_BOUNDS:
+        assert input.dtype == input_dtype, (
+            f"Expected: {input_dtype}, got: {input.dtype}"
         )
-        # TODO: this seems to be a detail for tinygemm (converting from uint to int, probably need to refactor this)
-        mid_point = (quant_max + quant_min + 1) / 2
-        # This should allocate new memory and avoid input modification
-        dequant = input - mid_point
-        dequant = dequant.to(output_dtype)
-        dequant *= scale
-        if zero_point is not None:
-            dequant += zero_point
+    assert output_dtype in [
+        torch.float32,
+        torch.float16,
+        torch.bfloat16,
+    ], f"Unsupported output dtype: {output_dtype}"
+    quant_min, quant_max = _get_and_check_qmin_qmax(input_dtype, quant_min, quant_max)
+    assert len(block_size) == input.dim(), (
+        f"Got input dim:{input.dim()}, block_size: {block_size}"
+    )
+    shape_for_reduction, reduction_dims = _get_reduction_params(
+        block_size, input.size()
+    )
+    original_shape = input.shape
+    input = input.view(shape_for_reduction)
+    shape_after_reduction = shape_for_reduction
+    for i in reduction_dims:
+        shape_after_reduction[i] = 1
+    scale = scale.view(shape_after_reduction)
+
+    if zero_point is not None:
+        zero_point = zero_point.view(shape_after_reduction)
+
+    # TODO: this seems to be a detail for tinygemm (converting from uint to int, probably need to refactor this)
+    mid_point = (quant_max + quant_min + 1) / 2
+    # This should allocate new memory and avoid input modification
+    dequant = input - mid_point
+    dequant = dequant.to(output_dtype)
+    dequant *= scale
+    if zero_point is not None:
+        dequant += zero_point
 
     return dequant.view(original_shape).to(output_dtype)
 
@@ -708,15 +876,22 @@ def _do_fake_quantize_affine(
     """
     input_dtype = input.dtype
     quant_min, quant_max = _get_and_check_qmin_qmax(quant_dtype, quant_min, quant_max)
-    q = _quantize_affine_no_dtype_cast(
+    if zero_point_domain == ZeroPointDomain.INT:
+        _quantize_affine = quantize_affine
+    elif zero_point_domain == ZeroPointDomain.FLOAT:
+        _quantize_affine = quantize_affine_float_zero_point
+    elif ZeroPointDomain == ZeroPointDomain.NONE:
+        _quantize_affine = quantize_affine_no_zero_point
+    else:
+        raise ValueError(f"Unrecognized zero point domain: {zero_point_domain}")
+    q = _quantize_affine(
         input,
         block_size,
         scale,
         zero_point,
+        quant_dtype,
         quant_min,
         quant_max,
-        quant_dtype,
-        zero_point_domain.name,
     )
     dq = _dequantize_affine_no_dtype_check(
         q,
@@ -725,7 +900,6 @@ def _do_fake_quantize_affine(
         zero_point,
         quant_min,
         quant_max,
-        zero_point_domain.name,
         output_dtype=input_dtype,
     )
     return (q, dq)
@@ -735,15 +909,13 @@ def _do_fake_quantize_affine(
 def choose_qparams_affine(
     input: torch.Tensor,
     mapping_type: MappingType,
-    block_size: Tuple[int, ...],
+    block_size: Tuple[int],
     target_dtype: torch.dtype,
     quant_min: Optional[Union[int, float]] = None,
     quant_max: Optional[Union[int, float]] = None,
     eps: Optional[float] = None,
     scale_dtype: Optional[torch.dtype] = None,
-    zero_point_dtype: Optional[torch.dtype] = None,
-    preserve_zero: bool = True,
-    zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
+    zero_point_dtype: Optional[torch.dtype] = torch.int32,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Args:
@@ -756,30 +928,11 @@ def choose_qparams_affine(
         quant_max (Optioanl[int]): maximum quantized value for target quantized Tensor
         eps (Optional[float]): minimum scale, if not provided, default to eps of input.dtype
         scale_dtype (torch.dtype): dtype for scale Tensor
-        zero_point_dtype (torch.dtype): dtype for zero_point Tensor
-        preserve_zero (bool): a flag to indicate whether we need zero to be exactly
-          representable or not, this is typically required for ops that needs zero padding, like convolution
-          it's less important for ops that doesn't have zero padding in the op itself, like linear.
-
-          For example, given a floating point Tensor [1.2, 0.1, 3.0, 4.0, 0.4, 0], if `preserve_zero` is True,
-          we'll make sure there is a integer value corresponding to the floating point 0, e.g. [-3, -8, 3, 7, -7, -8], 0 will be mapped to `-8` without loss. But if `preserve_zero` is not True, there won't be such
-          gurantee.
-
-          If we don't need zero to be exactly representable, we won't do rounding and clamping for zero_point
-
-        zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float
-            if zero_point is in integer domain, zero point is added to the quantized integer value during
-            quantization
-            if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized)
-            value during quantization
-            default is ZeroPointDomain.INT
+        zero_point_dtype (torch.dtype): dtype for zero_point Tensor, defaults to torch.int32
 
     Output:
         Tuple of scales and zero_points Tensor with requested dtype
     """
-    if zero_point_domain is None:
-        raise ValueError("Please use ZeroPointDomain.NONE instead of None")
-
     return _choose_qparams_affine(
         input,
         mapping_type.name,
@@ -790,16 +943,146 @@ def choose_qparams_affine(
         eps,
         scale_dtype,
         zero_point_dtype,
-        preserve_zero,
-        zero_point_domain.name,
     )
 
 
+@torch.no_grad()
+def choose_qparams_affine_tinygemm(
+    input: torch.Tensor,
+    mapping_type: MappingType,
+    block_size: Tuple[int, ...],
+    target_dtype: torch.dtype,
+    quant_min: Optional[Union[int, float]] = None,
+    quant_max: Optional[Union[int, float]] = None,
+    eps: Optional[float] = None,
+    scale_dtype: Optional[torch.dtype] = None,
+    zero_point_dtype: Optional[torch.dtype] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Specialized version of choose_qparams_affine with zero_point_domain=ZeroPointDomain.FLOAT and preserve_zero=False.
+
+    This is used for tinygemm int4mm kernel where zero point is in floating point domain
+    and zero does not have to be exactly representable.
+
+    Args:
+        input (torch.Tensor): fp32, bf16, fp16 input Tensor
+        mapping_type (MappingType): determines how the qparams are calculated, symmetric or asymmetric
+        block_size: (Tuple[int, ...]): granularity of quantization, this means the size of the tensor elements that's sharing the same qparam
+        target_dtype (torch.dtype): dtype for target quantized Tensor
+        quant_min (Optional[int]): minimum quantized value for target quantized Tensor
+        quant_max (Optioanl[int]): maximum quantized value for target quantized Tensor
+        eps (Optional[float]): minimum scale, if not provided, default to eps of input.dtype
+        scale_dtype (torch.dtype): dtype for scale Tensor
+        zero_point_dtype (torch.dtype): dtype for zero_point Tensor
+
+    Output:
+        Tuple of scales and zero_points Tensor with requested dtype
+    """
+    quant_min, quant_max = _get_and_check_qmin_qmax(target_dtype, quant_min, quant_max)
+    assert mapping_type is MappingType.ASYMMETRIC, (
+        f"Unsupported mapping type: {mapping_type}"
+    )
+    if scale_dtype is None:
+        scale_dtype = input.dtype
+    if eps is None:
+        eps = torch.finfo(input.dtype).eps
+
+    assert len(block_size) == input.dim(), (
+        f"Got input dim:{input.dim()}, block_size: {block_size}"
+    )
+    shape_for_reduction, reduction_dims = _get_reduction_params(
+        block_size, input.size()
+    )
+    input = input.view(shape_for_reduction)
+
+    min_val = torch.amin(input, dim=reduction_dims, keepdim=False)
+    max_val = torch.amax(input, dim=reduction_dims, keepdim=False)
+
+    # For preserve_zero=False, we don't ensure zero is exactly representable
+    min_val_neg = min_val
+    max_val_pos = max_val
+
+    scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+    scale = torch.clamp(scale, min=eps)
+
+    # For zero_point_domain=FLOAT in asymmetric quantization
+    mid_point = (quant_max + quant_min + 1) / 2
+    # this is not preserving zero_point, this is converting to TensorCoreTiledFormat
+    zero_point = min_val_neg + scale * mid_point
+
+    if zero_point_dtype is None:
+        zero_point_dtype = input.dtype
+
+    zero_point = zero_point.to(dtype=zero_point_dtype)
+    return scale.to(dtype=scale_dtype), zero_point
+
+
+def choose_qparams_affine_dont_preserve_zero(
+    input: torch.Tensor,
+    mapping_type: MappingType,
+    block_size: List[int],
+    target_dtype: torch.dtype,
+    quant_min: Optional[Union[int, float, bool]] = None,
+    quant_max: Optional[Union[int, float, bool]] = None,
+    eps: Optional[float] = None,
+    scale_dtype: Optional[torch.dtype] = None,
+    zero_point_dtype: Optional[torch.dtype] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Specialized version of choose_qparams_affine with zero_point_domain=ZeroPointDomain.INT and preserve_zero=False.
+
+    Args:
+        input (torch.Tensor): fp32, bf16, fp16 input Tensor
+        mapping_type (MappingType): determines how the qparams are calculated, asymmetric only
+        block_size: (Tuple[int, ...]): granularity of quantization, this means the size of the tensor elements that's sharing the same qparam
+        target_dtype (torch.dtype): dtype for target quantized Tensor
+        quant_min (Optional[int]): minimum quantized value for target quantized Tensor
+        quant_max (Optioanl[int]): maximum quantized value for target quantized Tensor
+        eps (Optional[float]): minimum scale, if not provided, default to eps of input.dtype
+        scale_dtype (torch.dtype): dtype for scale Tensor
+        zero_point_dtype (torch.dtype): dtype for zero_point Tensor
+
+    Output:
+        Tuple of scales and zero_points Tensor with requested dtype
+    """
+    quant_min, quant_max = _get_and_check_qmin_qmax(target_dtype, quant_min, quant_max)
+    assert mapping_type == MappingType.ASYMMETRIC, (
+        f"Unsupported mapping type: {mapping_type}"
+    )
+
+    if scale_dtype is None:
+        scale_dtype = input.dtype
+    if eps is None:
+        eps = torch.finfo(input.dtype).eps
+
+    assert len(block_size) == input.dim(), (
+        f"Got input dim:{input.dim()}, block_size: {block_size}"
+    )
+    shape_for_reduction, reduction_dims = _get_reduction_params(
+        block_size, input.size()
+    )
+    input = input.view(shape_for_reduction)
+
+    min_val = torch.amin(input, dim=reduction_dims, keepdim=False)
+    max_val = torch.amax(input, dim=reduction_dims, keepdim=False)
+
+    # For no preserve zero, we don't ensure zero is exactly representable
+    min_val_neg = min_val
+    max_val_pos = max_val
+
+    scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+    scale = torch.clamp(scale, min=eps)
+    # Zero point is int
+    zero_point = quant_min - torch.round(min_val_neg / scale)
+    zero_point = torch.clamp(zero_point, quant_min, quant_max)
+    if zero_point_dtype is None:
+        zero_point_dtype = torch.int32
+    return scale.to(dtype=scale_dtype), zero_point.to(dtype=zero_point_dtype)
+
+
 def choose_qparams_affine_with_min_max(
     min_val: torch.Tensor,
     max_val: torch.Tensor,
     mapping_type: MappingType,
-    block_size: Tuple[int, ...],
     target_dtype: torch.dtype,
     quant_min: Optional[int] = None,
     quant_max: Optional[int] = None,
@@ -821,22 +1104,94 @@ def choose_qparams_affine_with_min_max(
     """
     if zero_point_domain is None:
         raise ValueError("Please use ZeroPointDomain.NONE instead of None")
-    return _choose_qparams_affine(
-        None,
-        mapping_type.name,
-        block_size,
-        target_dtype,
-        quant_min,
-        quant_max,
-        eps,
-        scale_dtype,
-        zero_point_dtype,
-        preserve_zero,
-        zero_point_domain.name,
-        min_val,
-        max_val,
+    quant_min, quant_max = _get_and_check_qmin_qmax(target_dtype, quant_min, quant_max)
+    assert mapping_type in [
+        MappingType.SYMMETRIC,
+        MappingType.SYMMETRIC_NO_CLIPPING_ERR,
+        MappingType.ASYMMETRIC,
+    ], f"Unsupported mapping type: {mapping_type}"
+
+    assert min_val is not None and max_val is not None, (
+        "Need to provide `min_val` and `max_val`, got: {min_val, max_val}"
+    )
+    assert min_val.dtype == max_val.dtype, (
+        "Expecting `min_val` and `max_val` to have the same dtype, got: {min_val.dtype, max_val.dtype}"
     )
 
+    if scale_dtype is None:
+        scale_dtype = min_val.dtype
+    if eps is None:
+        eps = torch.finfo(min_val.dtype).eps
+
+    if preserve_zero:
+        min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+        max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
+    else:
+        min_val_neg = min_val
+        max_val_pos = max_val
+
+    if (
+        mapping_type == MappingType.SYMMETRIC
+        or mapping_type == MappingType.SYMMETRIC_NO_CLIPPING_ERR
+    ):
+        # scales
+        if mapping_type == MappingType.SYMMETRIC:
+            max_val_pos = torch.max(-min_val_neg, max_val_pos)
+            scale = max_val_pos / (float(quant_max - quant_min) / 2)
+        else:
+            assert mapping_type == MappingType.SYMMETRIC_NO_CLIPPING_ERR
+            # calculate smin and smax individually and choose the larger one. For example, if quant_min = -8 and
+            # quant_max = 7.
+            # - If smin is bigger: There would be coverage on negative values down to -8, and less rounding
+            # error than the existing SYMMETRIC case.
+            # - If smax is bigger: it covers the positive values up to 7. The round
+            # error may be bigger than the existing SYMMETRIC case. Either way, there's no out-of-range fp values after
+            # quantization.
+            smin = min_val_neg / float(quant_min)
+            smax = max_val_pos / float(quant_max)
+            mask = smin > smax
+            scale = torch.where(mask, smin, smax)
+        # zeros
+        if not preserve_zero:
+            raise ValueError(
+                "preserve_zero == False is not supported for symmetric quantization"
+            )
+        if zero_point_domain == ZeroPointDomain.FLOAT:
+            # TODO INT should not be a valid ZeroPointDomain for symmetric quantization since
+            # symmetric quant doesn't have a zero_point
+            raise ValueError(
+                "zero_point_domain should be ZeroPointDomain.INT or ZeroPointDomain.NONE for symmetric quantization"
+            )
+        if zero_point_domain == ZeroPointDomain.NONE:
+            zero_point = None
+        else:
+            zero_point = torch.full_like(scale, int((quant_max + quant_min + 1) / 2))
+        scale = torch.clamp(scale, min=eps)
+    else:
+        assert mapping_type == MappingType.ASYMMETRIC
+        scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+        scale = torch.clamp(scale, min=eps)
+        if zero_point_domain == ZeroPointDomain.NONE:
+            zero_point = None
+        elif zero_point_domain == ZeroPointDomain.INT:
+            zero_point = quant_min - torch.round(min_val_neg / scale)
+            zero_point = torch.clamp(zero_point, quant_min, quant_max)
+            if zero_point_dtype is None:
+                zero_point_dtype = torch.int32
+        else:
+            assert zero_point_domain == ZeroPointDomain.FLOAT, (
+                "zero_point must be in FLOAT/INT/None domain for asymmetric quantization"
+            )
+            mid_point = (quant_max + quant_min + 1) / 2
+            # this is not preserving zero_point, this is converting to TensorCoreTiledFormat
+            # TODO move the conversion of zero_point out of quant_primitives
+            # and into TensorCoreTiledLayout.from_plain
+            zero_point = min_val_neg + scale * mid_point
+
+    if zero_point is not None:
+        zero_point = zero_point.to(dtype=zero_point_dtype)
+    return scale.to(dtype=scale_dtype), zero_point
+
 
 @register_custom_op
 def _choose_qparams_affine(
@@ -868,46 +1223,25 @@ def _choose_qparams_affine(
         MappingType.SYMMETRIC_NO_CLIPPING_ERR.name,
         MappingType.ASYMMETRIC.name,
     ], f"Unsupported mapping type: {mapping_type}"
-    if target_dtype in FP8_TYPES:
-        assert mapping_type == MappingType.SYMMETRIC.name, (
-            f"Only symmetric quantization is supported for FP8 types, got {mapping_type}"
-        )
 
-    if input is not None:
-        if scale_dtype is None:
-            scale_dtype = input.dtype
-        if eps is None:
-            eps = torch.finfo(input.dtype).eps
+    if scale_dtype is None:
+        scale_dtype = input.dtype
+    if eps is None:
+        eps = torch.finfo(input.dtype).eps
 
-        assert len(block_size) == input.dim(), (
-            f"Got input dim:{input.dim()}, block_size: {block_size}"
-        )
-        shape_for_reduction, reduction_dims = _get_reduction_params(
-            block_size, input.size()
-        )
-        input = input.view(shape_for_reduction)
-
-        min_val = torch.amin(input, dim=reduction_dims, keepdim=False)
-        max_val = torch.amax(input, dim=reduction_dims, keepdim=False)
-    else:
-        assert min_val is not None and max_val is not None, (
-            "Need to provide `min_val` and `max_val` when `input` is None, got: {min_val, max_val}"
-        )
-        assert min_val.dtype == max_val.dtype, (
-            "Expecting `min_val` and `max_val` to have the same dtype, got: {min_val.dtype, max_val.dtype}"
-        )
+    assert len(block_size) == input.dim(), (
+        f"Got input dim:{input.dim()}, block_size: {block_size}"
+    )
+    shape_for_reduction, reduction_dims = _get_reduction_params(
+        block_size, input.size()
+    )
+    input = input.view(shape_for_reduction)
 
-        if scale_dtype is None:
-            scale_dtype = min_val.dtype
-        if eps is None:
-            eps = torch.finfo(min_val.dtype).eps
+    min_val = torch.amin(input, dim=reduction_dims, keepdim=False)
+    max_val = torch.amax(input, dim=reduction_dims, keepdim=False)
 
-    if preserve_zero:
-        min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
-        max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
-    else:
-        min_val_neg = min_val
-        max_val_pos = max_val
+    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
 
     if (
         mapping_type == MappingType.SYMMETRIC.name
@@ -930,46 +1264,18 @@ def _choose_qparams_affine(
             smax = max_val_pos / float(quant_max)
             mask = smin > smax
             scale = torch.where(mask, smin, smax)
-        # zeros
-        if not preserve_zero:
-            raise ValueError(
-                "preserve_zero == False is not supported for symmetric quantization"
-            )
-        if zero_point_domain == ZeroPointDomain.FLOAT.name:
-            # TODO INT should not be a valid ZeroPointDomain for symmetric quantization since
-            # symmetric quant doesn't have a zero_point
-            raise ValueError(
-                "zero_point_domain should be ZeroPointDomain.INT or ZeroPointDomain.NONE for symmetric quantization"
-            )
-        if zero_point_domain == ZeroPointDomain.NONE.name:
-            zero_point = None
-        else:
-            zero_point = torch.full_like(scale, int((quant_max + quant_min + 1) / 2))
+        zero_point = torch.full_like(scale, int((quant_max + quant_min + 1) / 2))
         scale = torch.clamp(scale, min=eps)
     else:
         assert mapping_type == MappingType.ASYMMETRIC.name
         scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
         scale = torch.clamp(scale, min=eps)
-        if zero_point_domain == ZeroPointDomain.NONE.name:
-            zero_point = None
-        elif zero_point_domain == ZeroPointDomain.INT.name:
-            zero_point = quant_min - torch.round(min_val_neg / scale)
-            zero_point = torch.clamp(zero_point, quant_min, quant_max)
-            if zero_point_dtype is None:
-                zero_point_dtype = torch.int32
-        else:
-            assert zero_point_domain == ZeroPointDomain.FLOAT.name, (
-                "zero_point must be in FLOAT/INT/None domain for asymmetric quantization"
-            )
-            mid_point = (quant_max + quant_min + 1) / 2
-            # this is not preserving zero_point, this is converting to TensorCoreTiledFormat
-            # TODO move the conversion of zero_point out of quant_primitives
-            # and into TensorCoreTiledLayout.from_plain
-            zero_point = min_val_neg + scale * mid_point
+        zero_point = quant_min - torch.round(min_val_neg / scale)
+        zero_point = torch.clamp(zero_point, quant_min, quant_max)
+        if zero_point_dtype is None:
+            zero_point_dtype = torch.int32
 
-    if zero_point is not None:
-        zero_point = zero_point.to(dtype=zero_point_dtype)
-    return scale.to(dtype=scale_dtype), zero_point
+    return scale.to(dtype=scale_dtype), zero_point.to(dtype=zero_point_dtype)
 
 
 def choose_qparams_and_quantize_affine_qqq(
diff --git a/torchao/quantization/utils.py b/torchao/quantization/utils.py
index 0c30fba713..dd9a610167 100644
--- a/torchao/quantization/utils.py
+++ b/torchao/quantization/utils.py
@@ -16,8 +16,14 @@
     MappingType,
     ZeroPointDomain,
     choose_qparams_affine,
+    choose_qparams_affine_dont_preserve_zero,
+    choose_qparams_affine_tinygemm,
     dequantize_affine,
+    dequantize_affine_float_zero_point,
+    dequantize_affine_no_zero_point,
     quantize_affine,
+    quantize_affine_float_zero_point,
+    quantize_affine_no_zero_point,
 )
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
@@ -343,19 +349,42 @@ def get_groupwise_affine_qparams(
         dtype if zero_point_domain != ZeroPointDomain.INT else torch.int32
     )
 
-    scale, zero_point = choose_qparams_affine(
-        w,
-        mapping_type,
-        block_size,
-        target_dtype,
-        quant_min,
-        quant_max,
-        eps,
-        scale_dtype=scale_dtype,
-        zero_point_dtype=zero_point_dtype,
-        preserve_zero=preserve_zero,
-        zero_point_domain=zero_point_domain,
-    )
+    if zero_point_domain == ZeroPointDomain.FLOAT and not preserve_zero:
+        scale, zero_point = choose_qparams_affine_tinygemm(
+            w,
+            mapping_type,
+            block_size,
+            target_dtype,
+            quant_min,
+            quant_max,
+            eps,
+            scale_dtype=scale_dtype,
+            zero_point_dtype=zero_point_dtype,
+        )
+    elif zero_point_domain == ZeroPointDomain.INT and not preserve_zero:
+        scale, zero_point = choose_qparams_affine_dont_preserve_zero(
+            w,
+            mapping_type,
+            block_size,
+            target_dtype,
+            quant_min,
+            quant_max,
+            eps,
+            scale_dtype=scale_dtype,
+            zero_point_dtype=zero_point_dtype,
+        )
+    else:  # Default case: zero_point_domain == ZeroPointDomain.INT and preserve_zero
+        scale, zero_point = choose_qparams_affine(
+            w,
+            mapping_type,
+            block_size,
+            target_dtype,
+            quant_min,
+            quant_max,
+            eps,
+            scale_dtype=scale_dtype,
+            zero_point_dtype=zero_point_dtype,
+        )
 
     return scale.to(dtype=dtype).reshape(w.shape[0], -1), zero_point.to(
         dtype=zero_point_dtype
@@ -418,7 +447,16 @@ def groupwise_affine_quantize_tensor_from_qparams(
     quant_min = 0
     quant_max = 2**n_bit - 1
 
-    int_data = quantize_affine(
+    if zero_point_domain == ZeroPointDomain.INT:
+        _quantize_affine = quantize_affine
+    elif zero_point_domain == ZeroPointDomain.FLOAT:
+        _quantize_affine = quantize_affine_float_zero_point
+    elif ZeroPointDomain == ZeroPointDomain.NONE:
+        _quantize_affine = quantize_affine_no_zero_point
+    else:
+        raise ValueError(f"Unrecognized zero point domain: {zero_point_domain}")
+
+    int_data = _quantize_affine(
         w,
         block_size,
         scales,
@@ -426,7 +464,6 @@ def groupwise_affine_quantize_tensor_from_qparams(
         output_dtype,
         quant_min,
         quant_max,
-        zero_point_domain=zero_point_domain,
     )
     if TORCH_VERSION_AT_LEAST_2_5 and w.shape[-1] > 1:
         if (not (check_cpu_version(int_data.device))) and (
@@ -474,7 +511,13 @@ def groupwise_affine_dequantize_tensor_from_qparams(
     input_dtype = torch.int32
     quant_min = 0
     quant_max = 2**n_bit - 1
-    return dequantize_affine(
+    if zero_point_domain == ZeroPointDomain.INT:
+        _dequantize_affine = dequantize_affine
+    elif zero_point_domain == ZeroPointDomain.FLOAT:
+        _dequantize_affine = dequantize_affine_float_zero_point
+    else:
+        _dequantize_affine = dequantize_affine_no_zero_point
+    return _dequantize_affine(
         w_int32,
         block_size,
         scales,
@@ -482,7 +525,6 @@ def groupwise_affine_dequantize_tensor_from_qparams(
         input_dtype,
         quant_min,
         quant_max,
-        zero_point_domain=zero_point_domain,
         output_dtype=scales.dtype,
     )