Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

move gptq patching to export stage #465

Merged
merged 1 commit into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from typing import Any, Callable, Dict, Optional, Union

from requests.exceptions import ConnectionError as RequestsConnectionError
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoTokenizer

from optimum.exporters import TasksManager
from optimum.exporters.onnx import __main__ as optimum_main
Expand Down Expand Up @@ -137,6 +137,41 @@ def main_export(
original_task = task
task = TasksManager.map_from_synonym(task)

# Patch the modules to export of GPTQ models w/o GPU
do_gptq_patching = False
try:
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
config_dict = config.to_dict()
quantization_config = config_dict.get("quantization_config", None)
do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
except Exception:
pass

if do_gptq_patching:
import torch

torch.set_default_dtype(torch.float32)
orig_cuda_check = torch.cuda.is_available
torch.cuda.is_available = lambda: True

from optimum.gptq import GPTQQuantizer

orig_post_init_model = GPTQQuantizer.post_init_model

def post_init_model(self, model):
from auto_gptq import exllama_set_max_input_length

class StoreAttr(object):
pass

model.quantize_config = StoreAttr()
model.quantize_config.desc_act = self.desc_act
if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
model = exllama_set_max_input_length(model, self.max_input_length)
return model

GPTQQuantizer.post_init_model = post_init_model

framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)

# get the shapes to be used to generate dummy inputs
Expand Down Expand Up @@ -324,3 +359,8 @@ def main_export(
int8=int8,
model_kwargs=model_kwargs,
)

# Unpatch modules after GPTQ export
if do_gptq_patching:
torch.cuda.is_available = orig_cuda_check
GPTQQuantizer.post_init_model = orig_post_init_model
35 changes: 1 addition & 34 deletions optimum/intel/openvino/modeling_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,34 +229,6 @@ def _from_transformers(
if use_cache:
task = task + "-with-past"

# Patch the modules to export of GPTQ models w/o GPU
do_gptq_patching = False
config_dict = config.to_dict()
quantization_config = config_dict.get("quantization_config", None)
do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
if do_gptq_patching:
torch.set_default_dtype(torch.float32)
orig_cuda_check = torch.cuda.is_available
torch.cuda.is_available = lambda: True

from optimum.gptq import GPTQQuantizer

orig_post_init_model = GPTQQuantizer.post_init_model

def post_init_model(self, model):
from auto_gptq import exllama_set_max_input_length

class StoreAttr(object):
pass

model.quantize_config = StoreAttr()
model.quantize_config.desc_act = self.desc_act
if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
model = exllama_set_max_input_length(model, self.max_input_length)
return model

GPTQQuantizer.post_init_model = post_init_model

main_export(
model_name_or_path=model_id,
output=save_dir_path,
Expand All @@ -271,11 +243,6 @@ class StoreAttr(object):
int8=load_in_8bit,
)

# Unpatch modules after GPTQ export
if do_gptq_patching:
torch.cuda.is_available = orig_cuda_check
GPTQQuantizer.post_init_model = orig_post_init_model

config.is_decoder = True
config.is_encoder_decoder = False
config.save_pretrained(save_dir_path)
Expand Down Expand Up @@ -504,7 +471,7 @@ def _from_pretrained(
elif model_type == "gpt-bigcode":
init_cls = OVGPTBigCodeForCausalLM
else:
init_cls = OVModelForCausalLM
init_cls = cls

return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs)

Expand Down