Support AWQ models #1049

Open · wants to merge 9 commits into main
5 changes: 5 additions & 0 deletions .github/workflows/test_openvino.yml
@@ -50,6 +50,11 @@ jobs:
name: Install specific dependencies and versions required for older transformers
run: |
pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
- if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*modeling*'}}
name: Install auto-gptq, autoawq
run: |
pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu
Member:
These are not valid extra URLs for auto-gptq and awq.

@eaidova (Collaborator) replied on Dec 18, 2024:
This is needed to prevent torch from being reinstalled with its CUDA build while the third-party packages are installed. The packages themselves are still resolved from the regular index; only their torch dependency is taken from the torch CPU URL. The difference between --index-url and --extra-index-url is that the former replaces the package index entirely, while the latter adds the given URL as an additional source that is used when a package is available there. For example, pip install autoawq --extra-index-url https://download.pytorch.org/whl/cpu installs autoawq from PyPI, while its torch dependency can be satisfied from the CPU wheel index instead of pulling the CUDA build.

- if: ${{ matrix.test-pattern == '*modeling*' }}
name: Uninstall NNCF
5 changes: 5 additions & 0 deletions .github/workflows/test_openvino_slow.yml
@@ -49,6 +49,11 @@ jobs:
name: Install specific dependencies and versions required for older transformers
run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator

- if: ${{ matrix.transformers-version == 'latest' }}
name: Install auto-gptq, autoawq
run: |
pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu

- name: Pip freeze
run: pip freeze

7 changes: 4 additions & 3 deletions optimum/exporters/openvino/__main__.py
@@ -247,7 +247,10 @@ def main_export(
trust_remote_code=trust_remote_code,
)
quantization_config = getattr(config, "quantization_config", None)
do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
supported_quant_methods = ["gptq"]
if is_openvino_version(">=", "2024.6.0"):
supported_quant_methods.append("awq")
do_gptq_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods
model_type = config.model_type.replace("_", "-")
if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
custom_architecture = True
@@ -296,7 +299,6 @@
if (
dtype is None
and framework == "pt"
and not do_gptq_patching
and (
task.startswith("text-generation")
or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
@@ -316,7 +318,6 @@
loading_kwargs["torch_dtype"] = dtype
# Patch the modules to export of GPTQ models w/o GPU
if do_gptq_patching:
torch.set_default_dtype(torch.float32)
orig_cuda_check = torch.cuda.is_available
torch.cuda.is_available = lambda: True

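For context: the GPTQ/AWQ integrations expect a CUDA device to be visible when a quantized checkpoint is loaded, so the exporter temporarily reports one and restores the original check afterwards; with the removed torch.set_default_dtype(torch.float32) call and the dropped "not do_gptq_patching" condition, quantized checkpoints now go through the same automatic dtype selection as other models. A minimal sketch of the patch/restore pattern (export_with_fake_cuda and export_fn are illustrative names, not the exporter's actual API):

    import torch

    def export_with_fake_cuda(export_fn, do_patch):
        # Run export_fn while torch.cuda.is_available() reports True, then restore
        # the original check even if the export fails.
        if not do_patch:
            return export_fn()
        orig_cuda_check = torch.cuda.is_available
        torch.cuda.is_available = lambda: True
        try:
            return export_fn()
        finally:
            torch.cuda.is_available = orig_cuda_check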
6 changes: 5 additions & 1 deletion optimum/exporters/openvino/convert.py
@@ -450,7 +450,11 @@ def ts_patched_forward(*args, **kwargs):
from openvino.frontend.pytorch.patch_model import unpatch_model

unpatch_model(model, "_openvino_module_extension_patch_orig_forward")
model.to(torch.float32)
for m in model.modules():
if any(p.dtype in [torch.float16, torch.bfloat16] for p in m.parameters(False)) or any(
b.dtype in [torch.float16, torch.bfloat16] for b in m.buffers(False)
):
m.float()

return export_pytorch_via_onnx(
model,
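The loop above replaces the previous blanket model.to(torch.float32) with a targeted upcast: only submodules that actually hold fp16/bf16 parameters or buffers are converted, and Module.float() itself leaves non-floating-point tensors (such as the packed int32 qweight/qzeros of AWQ/GPTQ layers) untouched. A small self-contained sketch of the same idea; the Toy module below is purely illustrative:

    import torch

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.half_linear = torch.nn.Linear(4, 4).half()  # fp16 weights, should become fp32
            # Packed quantized weights are integer tensors and must stay untouched.
            self.register_buffer("qweight", torch.zeros(4, 4, dtype=torch.int32))

    model = Toy()
    for m in model.modules():
        has_half = any(
            p.dtype in (torch.float16, torch.bfloat16) for p in m.parameters(recurse=False)
        ) or any(b.dtype in (torch.float16, torch.bfloat16) for b in m.buffers(recurse=False))
        if has_half:
            m.float()  # casts only the floating-point tensors of this submodule

    assert model.half_linear.weight.dtype == torch.float32
    assert model.qweight.dtype == torch.int32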
47 changes: 34 additions & 13 deletions tests/openvino/test_modeling.py
@@ -62,7 +62,7 @@
)
from transformers.onnx.utils import get_preprocessor
from transformers.testing_utils import slow
from utils_tests import MODEL_NAMES, TEST_IMAGE_URL
from utils_tests import MODEL_NAMES, TEST_IMAGE_URL, mock_torch_cuda_is_available, patch_awq_for_inference

from optimum.exporters.openvino.model_patcher import patch_update_causal_mask
from optimum.intel import (
@@ -872,7 +872,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"gpt_neo",
"gpt_neox",
"llama",
# "llama_gptq",
"marian",
"minicpm",
"mistral",
@@ -915,8 +914,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"exaone",
"mistral-nemo",
"minicpm3",
"opt_gptq",
)

if is_openvino_version(">=", "2024.6.0"):
SUPPORTED_ARCHITECTURES += ("mixtral_awq",)

GENERATION_LENGTH = 100
REMOTE_CODE_MODELS = (
"chatglm",
@@ -949,9 +952,6 @@ def test_compare_to_transformers(self, model_arch):
if is_openvino_version("<", "2024.1"):
not_stateful.extend(["llama", "gemma", "gpt_bigcode"])

if "gptq" in model_arch:
self.skipTest("GPTQ model loading unsupported with AutoModelForCausalLM")

set_seed(SEED)

model_kwargs = {}
@@ -978,20 +978,30 @@
if is_stateful:
self.assertTrue(len(ov_outputs.past_key_values) == 1 and len(ov_outputs.past_key_values[0]) == 0)

if "awq" in model_arch or "gptq" in model_arch:
# infer in FP32
model_kwargs["torch_dtype"] = torch.float32

set_seed(SEED)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch):
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
if model_arch in ["qwen", "arctic", "glm4"]:
transformers_model.to(torch.float32)

with torch.no_grad():
transformers_outputs = transformers_model(**tokens)
with patch_awq_for_inference("awq" in model_arch):
transformers_outputs = transformers_model(**tokens)

# Compare tensor outputs
atol = 1e-3 if model_arch == "minicpm" else 1e-4
# quantized models have higher tolerance
if "awq" in model_arch:
atol = 1e-2
elif "gptq" in model_arch:
atol = 0.6
self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=atol))

# Qwen tokenizer does not support padding

if model_arch in ["qwen"]:
return

@@ -1025,7 +1035,12 @@ def test_compare_to_transformers(self, model_arch):
from transformers.cache_utils import DynamicCache

additional_inputs = {"past_key_values": DynamicCache()}
transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs)
with patch_awq_for_inference("awq" in model_arch):
transformers_outputs = transformers_model.generate(
**tokens, generation_config=gen_config, **additional_inputs
)
print(f"ov_outputs: {ov_outputs}")
print(f"transformers_outputs: {transformers_outputs}")
self.assertTrue(
torch.allclose(ov_outputs, transformers_outputs),
"OV output {ov_outputs}\nTransformers output {transformers_output}",
@@ -1261,8 +1276,13 @@ def test_beam_search(self, model_arch):
ov_model_stateless = OVModelForCausalLM.from_pretrained(
model_id, export=True, use_cache=True, stateful=False, **model_kwargs
)
if "awq" in model_arch or "gptq" in model_arch:
# infer in FP32
model_kwargs["torch_dtype"] = torch.float32

set_seed(SEED)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
with mock_torch_cuda_is_available("awq" in model_arch or "gptq" in model_arch):
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

if model_arch == "arctic":
transformers_model.to(torch.float32)
@@ -1288,9 +1308,10 @@

if model_arch == "gemma2":
additional_inputs = {"past_key_values": DynamicCache()}
transformers_outputs = transformers_model.generate(
**tokens, generation_config=gen_config, **additional_inputs
)
with patch_awq_for_inference("awq" in model_arch):
transformers_outputs = transformers_model.generate(
**tokens, generation_config=gen_config, **additional_inputs
)
set_seed(SEED)
ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config)
self.assertTrue(
60 changes: 59 additions & 1 deletion tests/openvino/utils_tests.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from contextlib import contextmanager

import numpy as np
import openvino as ov
import torch
@@ -77,12 +79,12 @@
"longt5": "hf-internal-testing/tiny-random-longt5",
"llama": "HuggingFaceM4/tiny-random-LlamaForCausalLM",
"llama_awq": "HuggingFaceH4/tiny-random-LlamaForCausalLM",
"llama_gptq": "hf-internal-testing/TinyLlama-1.1B-Chat-v0.3-GPTQ",
"llava": "katuni4ka/tiny-random-llava",
"llava_next": "katuni4ka/tiny-random-llava-next",
"m2m_100": "hf-internal-testing/tiny-random-m2m_100",
"opt": "hf-internal-testing/tiny-random-OPTModel",
"opt125m": "facebook/opt-125m",
"opt_gptq": "ybelkada/opt-125m-gptq-4bit",
"marian": "sshleifer/tiny-marian-en-de",
"mbart": "hf-internal-testing/tiny-random-mbart",
"minicpm": "katuni4ka/tiny-random-minicpm",
@@ -91,6 +93,7 @@
"mistral": "echarlaix/tiny-random-mistral",
"mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
"mixtral": "TitanML/tiny-mixtral",
"mixtral_awq": "TitanML/tiny-mixtral-AWQ-4bit",
"mobilebert": "hf-internal-testing/tiny-random-MobileBertModel",
"mobilenet_v1": "google/mobilenet_v1_0.75_192",
"mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model",
@@ -218,3 +221,58 @@ def get_num_quantized_nodes(model):
if type_name == "nf4":
num_weight_nodes["nf4"] += 1
return num_fake_quantize, num_weight_nodes


@contextmanager
def mock_torch_cuda_is_available(to_patch):
original_is_available = torch.cuda.is_available
if to_patch:
torch.cuda.is_available = lambda: True
try:
yield
finally:
if to_patch:
torch.cuda.is_available = original_is_available


@contextmanager
def patch_awq_for_inference(to_patch):
orig_gemm_forward = None
if to_patch:
# patch GEMM module to allow inference without CUDA GPU
from awq.modules.linear.gemm import WQLinearMMFunction
from awq.utils.packing_utils import dequantize_gemm

def new_forward(
ctx,
x,
qweight,
qzeros,
scales,
w_bit=4,
group_size=128,
bias=None,
out_features=0,
):
ctx.out_features = out_features

out_shape = x.shape[:-1] + (out_features,)
x = x.to(torch.float16)

out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size)
out = torch.matmul(x, out)

out = out + bias if bias is not None else out
out = out.reshape(out_shape)

if len(out.shape) == 2:
out = out.unsqueeze(0)
return out

orig_gemm_forward = WQLinearMMFunction.forward
WQLinearMMFunction.forward = new_forward
try:
yield
finally:
if orig_gemm_forward is not None:
WQLinearMMFunction.forward = orig_gemm_forward
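
For reference, the two helpers above are typically combined like this when exercising an AWQ reference model on a runner without CUDA (a sketch assuming autoawq and a compatible transformers are installed; the checkpoint is the tiny Mixtral AWQ model already listed in MODEL_NAMES):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "TitanML/tiny-mixtral-AWQ-4bit"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokens = tokenizer("This is a sample input", return_tensors="pt")

    # Pretend CUDA exists so the AWQ integration agrees to load the checkpoint...
    with mock_torch_cuda_is_available(True):
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

    # ...then run the AWQ GEMM through the dequantize-and-matmul fallback on CPU.
    with patch_awq_for_inference(True):
        with torch.no_grad():
            outputs = model(**tokens)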