Support AWQ models #1049

Open · wants to merge 9 commits into base: main · Changes from 6 commits
7 changes: 4 additions & 3 deletions optimum/exporters/openvino/__main__.py
@@ -242,7 +242,10 @@ def main_export(
trust_remote_code=trust_remote_code,
)
quantization_config = getattr(config, "quantization_config", None)
do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
supported_quant_methods = ["gptq"]
if is_openvino_version(">=", "2024.6.0"):
supported_quant_methods.append("awq")
do_gptq_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods
model_type = config.model_type.replace("_", "-")
if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
custom_architecture = True
@@ -291,7 +294,6 @@ def main_export(
if (
dtype is None
and framework == "pt"
and not do_gptq_patching
and (
task.startswith("text-generation")
or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
@@ -311,7 +313,6 @@
loading_kwargs["torch_dtype"] = dtype
# Patch the modules to export of GPTQ models w/o GPU
if do_gptq_patching:
torch.set_default_dtype(torch.float32)
orig_cuda_check = torch.cuda.is_available
torch.cuda.is_available = lambda: True

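For context, a minimal sketch (outside the diff) of what the version-gated check above amounts to; `needs_quant_patching` is a hypothetical name, and the config dict shape is assumed to match what `transformers` stores for GPTQ/AWQ checkpoints:

```python
# Hypothetical helper, not part of the PR: mirrors the version-gated check above.
# The config dict is assumed to look like what transformers attaches to
# GPTQ/AWQ checkpoints, e.g. {"quant_method": "awq", "bits": 4, ...}.
def needs_quant_patching(quantization_config, openvino_version=(2024, 6, 0)):
    supported_quant_methods = ["gptq"]
    if openvino_version >= (2024, 6, 0):  # AWQ export assumed to need OpenVINO >= 2024.6.0
        supported_quant_methods.append("awq")
    return bool(quantization_config) and quantization_config.get("quant_method") in supported_quant_methods


assert needs_quant_patching({"quant_method": "awq"}) is True
assert needs_quant_patching({"quant_method": "awq"}, openvino_version=(2024, 5, 0)) is False
assert needs_quant_patching(None) is False
```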
6 changes: 5 additions & 1 deletion optimum/exporters/openvino/convert.py
@@ -448,7 +448,11 @@ def ts_patched_forward(*args, **kwargs):
from openvino.frontend.pytorch.patch_model import unpatch_model

unpatch_model(model, "_openvino_module_extension_patch_orig_forward")
model.to(torch.float32)
for m in model.modules():
if any(p.dtype in [torch.float16, torch.bfloat16] for p in m.parameters(False)) or any(
b.dtype in [torch.float16, torch.bfloat16] for b in m.buffers(False)
):
m.float()

return export_pytorch_via_onnx(
model,
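A rough, self-contained illustration of the selective upcast replacing `model.to(torch.float32)` above: only modules that directly own fp16/bf16 parameters or buffers are cast to fp32, so the packed integer weights of GPTQ/AWQ layers are left untouched (the toy model is made up for the example):

```python
import torch


def upcast_half_precision_modules(model: torch.nn.Module) -> None:
    half = (torch.float16, torch.bfloat16)
    for m in model.modules():
        # recurse=False: look only at tensors owned by this module, so quantized
        # submodules holding packed int tensors are not blanket-cast to fp32.
        if any(p.dtype in half for p in m.parameters(recurse=False)) or any(
            b.dtype in half for b in m.buffers(recurse=False)
        ):
            m.float()


toy = torch.nn.Sequential(torch.nn.Linear(4, 4).half(), torch.nn.Linear(4, 4))
upcast_half_precision_modules(toy)
assert all(p.dtype == torch.float32 for p in toy.parameters())
```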
2 changes: 2 additions & 0 deletions setup.py
@@ -38,6 +38,8 @@
]

TESTS_REQUIRE = [
"auto-gptq",
"autoawq",
"accelerate",
"pytest>=7.2.0,<8.0.0",
"parameterized",
47 changes: 34 additions & 13 deletions tests/openvino/test_modeling.py
@@ -62,7 +62,7 @@
)
from transformers.onnx.utils import get_preprocessor
from transformers.testing_utils import slow
from utils_tests import MODEL_NAMES, TEST_IMAGE_URL
from utils_tests import MODEL_NAMES, TEST_IMAGE_URL, mock_torch_cuda_is_available, patch_awq_for_inference

from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
from optimum.intel import (
@@ -872,13 +872,13 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"gpt_neo",
"gpt_neox",
"llama",
# "llama_gptq",
"marian",
"minicpm",
"mistral",
"mixtral",
"mpt",
"opt",
"opt_gptq",
"pegasus",
"qwen",
"phi",
@@ -917,6 +917,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"minicpm3",
)

if is_openvino_version(">=", "2024.6.0"):
SUPPORTED_ARCHITECTURES += ("mixtral_awq",)

GENERATION_LENGTH = 100
REMOTE_CODE_MODELS = (
"chatglm",
@@ -949,9 +952,6 @@ def test_compare_to_transformers(self, model_arch):
if is_openvino_version("<", "2024.1"):
not_stateful.extend(["llama", "gemma", "gpt_bigcode"])

if "gptq" in model_arch:
self.skipTest("GPTQ model loading unsupported with AutoModelForCausalLM")

set_seed(SEED)

model_kwargs = {}
@@ -978,20 +978,30 @@ def test_compare_to_transformers(self, model_arch):
if is_stateful:
self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)

if "awq" in model_arch or "gptq" in model_arch:
# infer in FP32
model_kwargs["torch_dtype"] = torch.float32

set_seed(SEED)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch):
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
if model_arch in ["qwen", "arctic", "glm4"]:
transformers_model.to(torch.float32)

with torch.no_grad():
transformers_outputs = transformers_model(**tokens)
with patch_awq_for_inference("awq" in model_arch):
transformers_outputs = transformers_model(**tokens)

# Compare tensor outputs
atol = 1e-3 if model_arch == "minicpm" else 1e-4
# quantized models have higher tolerance
if "awq" in model_arch:
atol = 1e-2
elif "gptq" in model_arch:
atol = 0.6
self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol))

# Qwen tokenizer does not support padding

if model_arch in ["qwen"]:
return

@@ -1025,7 +1035,12 @@ def test_compare_to_transformers(self, model_arch):
from transformers.cache_utils import DynamicCache

additional_inputs = {"past_key_values": DynamicCache()}
transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs)
with patch_awq_for_inference("awq" in model_arch):
transformers_outputs = transformers_model.generate(
**tokens, generation_config=gen_config, **additional_inputs
)
print(f"ov_outputs: {ov_outputs}")
print(f"transformers_outputs: {transformers_outputs}")
self.assertTrue(
torch.allclose(ov_outputs, transformers_outputs),
"OV output {ov_outputs}\nTransformers output {transformers_output}",
@@ -1261,8 +1276,13 @@ def test_beam_search(self, model_arch):
ov_model_stateless = OVModelForCausalLM.from_pretrained(
model_id, export=True, use_cache=True, stateful=False, **model_kwargs
)
if "awq" in model_arch or "gptq" in model_arch:
# infer in FP32
model_kwargs["torch_dtype"] = torch.float32

set_seed(SEED)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch):
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

if model_arch == "arctic":
transformers_model.to(torch.float32)
@@ -1288,9 +1308,10 @@

if model_arch == "gemma2":
additional_inputs = {"past_key_values": DynamicCache()}
transformers_outputs = transformers_model.generate(
**tokens, generation_config=gen_config, **additional_inputs
)
with patch_awq_for_inference("awq" in model_arch):
transformers_outputs = transformers_model.generate(
**tokens, generation_config=gen_config, **additional_inputs
)
set_seed(SEED)
ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config)
self.assertTrue(
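As a quick standalone illustration of the tolerance scheme used in `test_compare_to_transformers` above (the `pick_atol` helper and the tensors are invented for the example; the thresholds are the ones from the test):

```python
import torch


def pick_atol(model_arch: str) -> float:
    atol = 1e-3 if model_arch == "minicpm" else 1e-4
    # Quantized reference models run through dequantized fp16 weights, so they
    # are compared against the OpenVINO output with a looser tolerance.
    if "awq" in model_arch:
        atol = 1e-2
    elif "gptq" in model_arch:
        atol = 0.6
    return atol


ref = torch.randn(1, 4, 32)
shifted = ref + 5e-3  # within the AWQ tolerance, outside the default one
assert torch.allclose(ref, shifted, atol=pick_atol("mixtral_awq"))
assert not torch.allclose(ref, shifted, atol=pick_atol("llama"))
```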
59 changes: 58 additions & 1 deletion tests/openvino/utils_tests.py
@@ -15,6 +15,7 @@
import numpy as np
import openvino as ov
import torch
from contextlib import contextmanager


MODEL_NAMES = {
@@ -77,12 +78,12 @@
"longt5": "hf-internal-testing/tiny-random-longt5",
"llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM",
"llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM",
"llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
"llava": "katuni4ka/tiny-random-llava",
"llava_next": "katuni4ka/tiny-random-llava-next",
"m2m_100": "hf-internal-testing/tiny-random-m2m_100",
"opt": "hf-internal-testing/tiny-random-OPTModel",
"opt125m": "facebook/opt-125m",
"opt_gptq": "ybelkada/opt-125m-gptq-4bit",
"marian": "sshleifer/tiny-marian-en-de",
"mbart": "hf-internal-testing/tiny-random-mbart",
"minicpm": "katuni4ka/tiny-random-minicpm",
@@ -91,6 +92,7 @@
"mistral": "echarlaix/tiny-random-mistral",
"mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
"mixtral": "TitanML/tiny-mixtral",
"mixtral_awq": "TitanML/tiny-mixtral-AWQ-4bit",
"mobilebert": "hf-internal-testing/tiny-random-MobileBertModel",
"mobilenet_v1": "google/mobilenet_v1_0.75_192",
"mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model",
@@ -218,3 +220,58 @@ def get_num_quantized_nodes(model):
if type_name == "nf4":
num_weight_nodes["nf4"] += 1
return num_fake_quantize, num_weight_nodes


@contextmanager
def mock_torch_cuda_is_available(to_patch):
original_is_available = torch.cuda.is_available
if to_patch:
torch.cuda.is_available = lambda: True
try:
yield
finally:
if to_patch:
torch.cuda.is_available = original_is_available


@contextmanager
def patch_awq_for_inference(to_patch):
orig_gemm_forward = None
if to_patch:
# patch GEMM module to allow inference without CUDA GPU
from awq.modules.linear.gemm import WQLinearMMFunction
from awq.utils.packing_utils import dequantize_gemm

def new_forward(
ctx,
x,
qweight,
qzeros,
scales,
w_bit=4,
group_size=128,
bias=None,
out_features=0,
):
ctx.out_features = out_features

out_shape = x.shape[:-1] + (out_features,)
x = x.to(torch.float16)

out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size)
out = torch.matmul(x, out)

out = out + bias if bias is not None else out
out = out.reshape(out_shape)

if len(out.shape) == 2:
out = out.unsqueeze(0)
return out

orig_gemm_forward = WQLinearMMFunction.forward
WQLinearMMFunction.forward = new_forward
try:
yield
finally:
if orig_gemm_forward is not None:
WQLinearMMFunction.forward = orig_gemm_forward
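Put together, a hedged sketch of how the two helpers above are meant to be combined in a CPU-only test run; the model id comes from `MODEL_NAMES`, while the prompt, dtype, and final print are placeholders:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TitanML/tiny-mixtral-AWQ-4bit"  # AWQ entry from MODEL_NAMES above

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer("This is a sample input", return_tensors="pt")

# Pretend CUDA is available so the AWQ checkpoint loads on CPU-only CI ...
with mock_torch_cuda_is_available(True):
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

# ... then route the AWQ GEMM through the dequantize-based forward for inference.
with torch.no_grad(), patch_awq_for_inference(True):
    outputs = model(**tokens)
print(outputs.logits.shape)
```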