From 64f64b041ab2b4a44bb4006f0527803c1d5c67cf Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Wed, 4 Dec 2024 13:39:08 +0100 Subject: [PATCH 1/8] Support AWQ models --- optimum/exporters/openvino/__main__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index e4fe2a7a41..0e913e8637 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -242,7 +242,7 @@ def main_export( trust_remote_code=trust_remote_code, ) quantization_config = getattr(config, "quantization_config", None) - do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" + do_gptq_patching = quantization_config and quantization_config["quant_method"] in ["gptq", "awq"] model_type = config.model_type.replace("_", "-") if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True @@ -291,7 +291,6 @@ def main_export( if ( dtype is None and framework == "pt" - and not do_gptq_patching and ( task.startswith("text-generation") or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS @@ -311,7 +310,6 @@ def main_export( loading_kwargs["torch_dtype"] = dtype # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: - torch.set_default_dtype(torch.float32) orig_cuda_check = torch.cuda.is_available torch.cuda.is_available = lambda: True From 86d9328ab6cfde60a97c99492c282abbe8cbd2d5 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Thu, 5 Dec 2024 16:37:02 +0100 Subject: [PATCH 2/8] Add tests --- optimum/exporters/openvino/convert.py | 7 +++- tests/openvino/test_modeling.py | 60 ++++++++++++++++++++++++--- tests/openvino/utils_tests.py | 3 +- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 6012e6cfb5..3f8a73df6f 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -447,8 +447,11 @@ def ts_patched_forward(*args, **kwargs): if patch_16bit_model: from openvino.frontend.pytorch.patch_model import unpatch_model - unpatch_model(model, "_openvino_module_extension_patch_orig_forward") - model.to(torch.float32) + unpatch_model(model, "_openvino_module_extension_patch_orig_forward") + for m in model.modules(): + if (any(p.dtype in [torch.float16, torch.bfloat16] for p in m.parameters()) + or any(b.dtype in [torch.float16, torch.bfloat16] for b in m.buffers())): + m.float() return export_pytorch_via_onnx( model, diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 240f4f9e3f..71139547f5 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -872,13 +872,14 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neo", "gpt_neox", "llama", - # "llama_gptq", "marian", "minicpm", "mistral", "mixtral", + "mixtral_awq", "mpt", "opt", + "opt_gptq", "pegasus", "qwen", "phi", @@ -949,9 +950,6 @@ def test_compare_to_transformers(self, model_arch): if is_openvino_version("<", "2024.1"): not_stateful.extend(["llama", "gemma", "gpt_bigcode"]) - if "gptq" in model_arch: - self.skipTest("GPTQ model loading unsupported with AutoModelForCausalLM") - set_seed(SEED) model_kwargs = {} @@ -978,6 +976,46 @@ def test_compare_to_transformers(self, model_arch): if is_stateful: self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) + if "awq" in model_arch or "gptq" in model_arch: + 
orig_cuda_is_available = torch.cuda.is_available + torch.cuda.is_available = lambda: True + # infer in FP32 + model_kwargs["torch_dtype"] = torch.float32 + + if "awq" in model_arch: + # patch GEMM module to allow inference without CUDA GPU + from awq.modules.linear.gemm import WQLinearMMFunction + from awq.utils.packing_utils import dequantize_gemm + + def new_forward( + ctx, + x, + qweight, + qzeros, + scales, + w_bit=4, + group_size=128, + bias=None, + out_features=0, + ): + ctx.out_features = out_features + + out_shape = x.shape[:-1] + (out_features,) + x = x.to(torch.float16) + + out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) + out = torch.matmul(x, out) + + out = out + bias if bias is not None else out + out = out.reshape(out_shape) + + if len(out.shape) == 2: + out = out.unsqueeze(0) + return out + + orig_gemm_forward = WQLinearMMFunction.forward + WQLinearMMFunction.forward = new_forward + set_seed(SEED) transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) if model_arch in ["qwen", "arctic", "glm4"]: @@ -988,10 +1026,14 @@ def test_compare_to_transformers(self, model_arch): # Compare tensor outputs atol = 1e-3 if model_arch == "minicpm" else 1e-4 + # quantized models have higher tolerance + if "awq" in model_arch: + atol = 1e-2 + elif "gptq" in model_arch: + atol = 0.6 self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol)) # Qwen tokenizer does not support padding - if model_arch in ["qwen"]: return @@ -1026,11 +1068,19 @@ def test_compare_to_transformers(self, model_arch): additional_inputs = {"past_key_values": DynamicCache()} transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs) + print(f"ov_outputs: {ov_outputs}") + print(f"transformers_outputs: {transformers_outputs}") self.assertTrue( torch.allclose(ov_outputs, transformers_outputs), "OV output {ov_outputs}\nTransformers output {transformers_output}", ) + if "awq" in model_arch: + WQLinearMMFunction.forward = orig_gemm_forward + + if "awq" in model_arch or "gptq" in model_arch: + torch.cuda.is_available = orig_cuda_is_available + del transformers_model del ov_model gc.collect() diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 17d9dd1fbe..a725cb3d2d 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -77,12 +77,12 @@ "longt5": "hf-internal-testing/tiny-random-longt5", "llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM", "llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM", - "llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ", "llava": "katuni4ka/tiny-random-llava", "llava_next": "katuni4ka/tiny-random-llava-next", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", "opt": "hf-internal-testing/tiny-random-OPTModel", "opt125m": "facebook/opt-125m", + "opt_gptq": "katuni4ka/opt-125m-gptq", "marian": "sshleifer/tiny-marian-en-de", "mbart": "hf-internal-testing/tiny-random-mbart", "minicpm": "katuni4ka/tiny-random-minicpm", @@ -91,6 +91,7 @@ "mistral": "echarlaix/tiny-random-mistral", "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo", "mixtral": "TitanML/tiny-mixtral", + "mixtral_awq": "TitanML/tiny-mixtral-AWQ-4bit", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", From decbcc203bc5928a1b52c65a1e8ff917ff66d1e1 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: 
Thu, 5 Dec 2024 17:47:57 +0100 Subject: [PATCH 3/8] Add dependencies --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index cd49ea041a..8e1661f4e2 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,8 @@ ] TESTS_REQUIRE = [ + "auto-gptq", + "autoawq", "accelerate", "pytest>=7.2.0,<8.0.0", "parameterized", From 9fb1da4b3f0c0f8160f865de17eb1f70f2d56ad5 Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Wed, 11 Dec 2024 17:24:23 +0100 Subject: [PATCH 4/8] Fix tests --- optimum/exporters/openvino/convert.py | 7 +-- tests/openvino/test_modeling.py | 67 +++++++-------------------- tests/openvino/utils_tests.py | 58 ++++++++++++++++++++++- 3 files changed, 78 insertions(+), 54 deletions(-) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 3f8a73df6f..e7cdbfbc9a 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -447,10 +447,11 @@ def ts_patched_forward(*args, **kwargs): if patch_16bit_model: from openvino.frontend.pytorch.patch_model import unpatch_model - unpatch_model(model, "_openvino_module_extension_patch_orig_forward") + unpatch_model(model, "_openvino_module_extension_patch_orig_forward") for m in model.modules(): - if (any(p.dtype in [torch.float16, torch.bfloat16] for p in m.parameters()) - or any(b.dtype in [torch.float16, torch.bfloat16] for b in m.buffers())): + if any(p.dtype in [torch.float16, torch.bfloat16] for p in m.parameters(False)) or any( + b.dtype in [torch.float16, torch.bfloat16] for b in m.buffers(False) + ): m.float() return export_pytorch_via_onnx( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 71139547f5..38ebb13bfa 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -62,7 +62,7 @@ ) from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import slow -from utils_tests import MODEL_NAMES, TEST_IMAGE_URL +from utils_tests import MODEL_NAMES, TEST_IMAGE_URL, mock_torch_cuda_is_available, patch_awq_for_inference from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.intel import ( @@ -977,52 +977,18 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0) if "awq" in model_arch or "gptq" in model_arch: - orig_cuda_is_available = torch.cuda.is_available - torch.cuda.is_available = lambda: True # infer in FP32 model_kwargs["torch_dtype"] = torch.float32 - if "awq" in model_arch: - # patch GEMM module to allow inference without CUDA GPU - from awq.modules.linear.gemm import WQLinearMMFunction - from awq.utils.packing_utils import dequantize_gemm - - def new_forward( - ctx, - x, - qweight, - qzeros, - scales, - w_bit=4, - group_size=128, - bias=None, - out_features=0, - ): - ctx.out_features = out_features - - out_shape = x.shape[:-1] + (out_features,) - x = x.to(torch.float16) - - out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) - out = torch.matmul(x, out) - - out = out + bias if bias is not None else out - out = out.reshape(out_shape) - - if len(out.shape) == 2: - out = out.unsqueeze(0) - return out - - orig_gemm_forward = WQLinearMMFunction.forward - WQLinearMMFunction.forward = new_forward - set_seed(SEED) - transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): + transformers_model = 
AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) if model_arch in ["qwen", "arctic", "glm4"]: transformers_model.to(torch.float32) with torch.no_grad(): - transformers_outputs = transformers_model(**tokens) + with patch_awq_for_inference("awq" in model_arch): + transformers_outputs = transformers_model(**tokens) # Compare tensor outputs atol = 1e-3 if model_arch == "minicpm" else 1e-4 @@ -1067,7 +1033,8 @@ def new_forward( from transformers.cache_utils import DynamicCache additional_inputs = {"past_key_values": DynamicCache()} - transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs) + with patch_awq_for_inference("awq" in model_arch): + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs) print(f"ov_outputs: {ov_outputs}") print(f"transformers_outputs: {transformers_outputs}") self.assertTrue( @@ -1075,12 +1042,6 @@ def new_forward( "OV output {ov_outputs}\nTransformers output {transformers_output}", ) - if "awq" in model_arch: - WQLinearMMFunction.forward = orig_gemm_forward - - if "awq" in model_arch or "gptq" in model_arch: - torch.cuda.is_available = orig_cuda_is_available - del transformers_model del ov_model gc.collect() @@ -1311,8 +1272,13 @@ def test_beam_search(self, model_arch): ov_model_stateless = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=True, stateful=False, **model_kwargs ) + if "awq" in model_arch or "gptq" in model_arch: + # infer in FP32 + model_kwargs["torch_dtype"] = torch.float32 + set_seed(SEED) - transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) + with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch): + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs) if model_arch == "arctic": transformers_model.to(torch.float32) @@ -1338,9 +1304,10 @@ def test_beam_search(self, model_arch): if model_arch == "gemma2": additional_inputs = {"past_key_values": DynamicCache()} - transformers_outputs = transformers_model.generate( - **tokens, generation_config=gen_config, **additional_inputs - ) + with patch_awq_for_inference("awq" in model_arch): + transformers_outputs = transformers_model.generate( + **tokens, generation_config=gen_config, **additional_inputs + ) set_seed(SEED) ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) self.assertTrue( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index a725cb3d2d..fba13326f1 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -15,6 +15,7 @@ import numpy as np import openvino as ov import torch +from contextlib import contextmanager MODEL_NAMES = { @@ -82,7 +83,7 @@ "m2m_100": "hf-internal-testing/tiny-random-m2m_100", "opt": "hf-internal-testing/tiny-random-OPTModel", "opt125m": "facebook/opt-125m", - "opt_gptq": "katuni4ka/opt-125m-gptq", + "opt_gptq": "ybelkada/opt-125m-gptq-4bit", "marian": "sshleifer/tiny-marian-en-de", "mbart": "hf-internal-testing/tiny-random-mbart", "minicpm": "katuni4ka/tiny-random-minicpm", @@ -219,3 +220,58 @@ def get_num_quantized_nodes(model): if type_name == "nf4": num_weight_nodes["nf4"] += 1 return num_fake_quantize, num_weight_nodes + + +@contextmanager +def mock_torch_cuda_is_available(to_patch): + original_is_available = torch.cuda.is_available + if to_patch: + torch.cuda.is_available = lambda: True + try: + yield + finally: + if to_patch: + torch.cuda.is_available = 
original_is_available + + +@contextmanager +def patch_awq_for_inference(to_patch): + orig_gemm_forward = None + if to_patch: + # patch GEMM module to allow inference without CUDA GPU + from awq.modules.linear.gemm import WQLinearMMFunction + from awq.utils.packing_utils import dequantize_gemm + + def new_forward( + ctx, + x, + qweight, + qzeros, + scales, + w_bit=4, + group_size=128, + bias=None, + out_features=0, + ): + ctx.out_features = out_features + + out_shape = x.shape[:-1] + (out_features,) + x = x.to(torch.float16) + + out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) + out = torch.matmul(x, out) + + out = out + bias if bias is not None else out + out = out.reshape(out_shape) + + if len(out.shape) == 2: + out = out.unsqueeze(0) + return out + + orig_gemm_forward = WQLinearMMFunction.forward + WQLinearMMFunction.forward = new_forward + try: + yield + finally: + if orig_gemm_forward is not None: + WQLinearMMFunction.forward = orig_gemm_forward From 04d0cf90aa468e16f3ca6324e7879196b9dbccfc Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 17 Dec 2024 19:35:19 +0400 Subject: [PATCH 5/8] enable awq export only if ov support it --- optimum/exporters/openvino/__main__.py | 5 ++++- tests/openvino/test_modeling.py | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 0e913e8637..42d3d94064 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -242,7 +242,10 @@ def main_export( trust_remote_code=trust_remote_code, ) quantization_config = getattr(config, "quantization_config", None) - do_gptq_patching = quantization_config and quantization_config["quant_method"] in ["gptq", "awq"] + supported_quant_methods = ["gptq"] + if is_openvino_version(">=", "2024.6.0"): + supported_quant_methods.append("awq") + do_gptq_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods model_type = config.model_type.replace("_", "-") if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 38ebb13bfa..e1f1cecda5 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -876,7 +876,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "minicpm", "mistral", "mixtral", - "mixtral_awq", "mpt", "opt", "opt_gptq", @@ -918,6 +917,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "minicpm3", ) + if is_openvino_version(">=", "2024.6.0"): + SUPPORTED_ARCHITECTURES += ("mixtral_awq",) + GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( "chatglm", @@ -1034,7 +1036,9 @@ def test_compare_to_transformers(self, model_arch): additional_inputs = {"past_key_values": DynamicCache()} with patch_awq_for_inference("awq" in model_arch): - transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs) + transformers_outputs = transformers_model.generate( + **tokens, generation_config=gen_config, **additional_inputs + ) print(f"ov_outputs: {ov_outputs}") print(f"transformers_outputs: {transformers_outputs}") self.assertTrue( From df97004c83502313e46490a007585c6254834e6c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 17 Dec 2024 19:47:22 +0400 Subject: [PATCH 6/8] fix style (#2) --- tests/openvino/utils_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py 
b/tests/openvino/utils_tests.py index fba13326f1..0e748e7148 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from contextlib import contextmanager + import numpy as np import openvino as ov import torch -from contextlib import contextmanager MODEL_NAMES = { From cf2fc8b6e33d3138aded3466a53fa22df6841138 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 17 Dec 2024 20:33:51 +0400 Subject: [PATCH 7/8] disable awq and gptq install for old torch (#3) * fix style * disable autogptq and autoawq install for old transformers testing --- .github/workflows/test_openvino.yml | 5 +++++ .github/workflows/test_openvino_slow.yml | 5 +++++ setup.py | 2 -- tests/openvino/test_modeling.py | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index e2889cb4e0..eca8233988 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -49,6 +49,11 @@ jobs: name: Downgrade Transformers and Accelerate run: | pip install transformers==${{ matrix.transformers-version }} accelerate==0.* + + - if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*modeling*'}} + name: Install auto-gptq, autoawq + run: | + pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu - if: ${{ matrix.test-pattern == '*modeling*' }} name: Uninstall NNCF diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index bf52413a7d..ccb564bb33 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -56,6 +56,11 @@ jobs: name: Downgrade Transformers and Accelerate run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* + - if: ${{ matrix.transformers-version == 'latest' }} + name: Install auto-gptq, autoawq + run: | + pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu + - name: Pip freeze run: pip freeze diff --git a/setup.py b/setup.py index 8e1661f4e2..cd49ea041a 100644 --- a/setup.py +++ b/setup.py @@ -38,8 +38,6 @@ ] TESTS_REQUIRE = [ - "auto-gptq", - "autoawq", "accelerate", "pytest>=7.2.0,<8.0.0", "parameterized", diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index e1f1cecda5..8927da1ab4 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -878,7 +878,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "mixtral", "mpt", "opt", - "opt_gptq", "pegasus", "qwen", "phi", @@ -915,6 +914,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "exaone", "mistral-nemo", "minicpm3", + "opt_gptq", ) if is_openvino_version(">=", "2024.6.0"): From f0f7a722c6d853e2c20ceaad3bf9800d505ea164 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 18 Dec 2024 12:50:56 +0400 Subject: [PATCH 8/8] separate common quant models patching and gptq (#4) --- optimum/exporters/openvino/__main__.py | 36 ++++++++++++++------------ 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 7940ef567c..3015b20e83 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -232,6 +232,7 @@ def main_export( ) do_gptq_patching = False + do_quant_patching = False custom_architecture = 
False patch_16bit = False loading_kwargs = model_loading_kwargs or {} @@ -250,7 +251,8 @@ def main_export( supported_quant_methods = ["gptq"] if is_openvino_version(">=", "2024.6.0"): supported_quant_methods.append("awq") - do_gptq_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods + do_quant_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods + do_gptq_patching = do_quant_patching and quantization_config["quant_method"] == "gptq" model_type = config.model_type.replace("_", "-") if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True @@ -317,27 +319,28 @@ def main_export( patch_16bit = True loading_kwargs["torch_dtype"] = dtype # Patch the modules to export of GPTQ models w/o GPU - if do_gptq_patching: + if do_quant_patching: orig_cuda_check = torch.cuda.is_available torch.cuda.is_available = lambda: True - from optimum.gptq import GPTQQuantizer + if do_gptq_patching: + from optimum.gptq import GPTQQuantizer - orig_post_init_model = GPTQQuantizer.post_init_model + orig_post_init_model = GPTQQuantizer.post_init_model - def post_init_model(self, model): - from auto_gptq import exllama_set_max_input_length + def post_init_model(self, model): + from auto_gptq import exllama_set_max_input_length - class StoreAttr(object): - pass + class StoreAttr(object): + pass - model.quantize_config = StoreAttr() - model.quantize_config.desc_act = self.desc_act - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: - model = exllama_set_max_input_length(model, self.max_input_length) - return model + model.quantize_config = StoreAttr() + model.quantize_config.desc_act = self.desc_act + if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + model = exllama_set_max_input_length(model, self.max_input_length) + return model - GPTQQuantizer.post_init_model = post_init_model + GPTQQuantizer.post_init_model = post_init_model elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"): dtype = deduce_diffusers_dtype( model_name_or_path, @@ -486,9 +489,10 @@ class StoreAttr(object): compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin")) # Unpatch modules after GPTQ export - if do_gptq_patching: + if do_quant_patching: torch.cuda.is_available = orig_cuda_check - GPTQQuantizer.post_init_model = orig_post_init_model + if do_gptq_patching: + GPTQQuantizer.post_init_model = orig_post_init_model def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
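
Note (not part of the patch series above): a minimal usage sketch of the two test helpers this series adds in tests/openvino/utils_tests.py, showing how the tests combine them to run a quantized checkpoint on a host without a CUDA device. It assumes the snippet is executed from tests/openvino so that utils_tests is importable; the prompt text is illustrative, while the model id is the "mixtral_awq" entry from MODEL_NAMES.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils_tests import mock_torch_cuda_is_available, patch_awq_for_inference

model_id = "TitanML/tiny-mixtral-AWQ-4bit"  # "mixtral_awq" in MODEL_NAMES
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer("This is a sample input", return_tensors="pt")

# As in test_compare_to_transformers, torch.cuda.is_available is mocked to
# return True while the quantized checkpoint is loaded, and inference is kept
# in FP32 via torch_dtype.
with mock_torch_cuda_is_available(True):
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

# WQLinearMMFunction.forward is temporarily replaced by the dequantize + matmul
# fallback defined in patch_awq_for_inference, so reference outputs can be
# computed without the AWQ CUDA kernels; the original forward is restored on exit.
with torch.no_grad(), patch_awq_for_inference(True):
    outputs = model(**tokens)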