diff --git a/README.md b/README.md
index 9f3e81bdba..b9eec74682 100644
--- a/README.md
+++ b/README.md
@@ -314,6 +314,13 @@ List of command-line flags
 |-------------|-------------|
 | `--hqq-backend` | Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. |
 
+#### IPEX-LLM
+
+| Flag | Description |
+|---------------------------------------|-------------|
+| `--load-in-4bit` | Load the model to symmetric int4 precision with ipex-llm optimizations. |
+| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
+
 #### DeepSpeed
 
 | Flag | Description |
diff --git a/modules/loaders.py b/modules/loaders.py
index 234773397d..401d24994d 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -148,6 +148,10 @@
         'hqq_backend',
         'trust_remote_code',
         'no_use_fast',
+    ],
+    'IPEX-LLM': [
+        'load_in_4bit',
+        'trust_remote_code',
     ]
 })
 
@@ -203,6 +207,7 @@ def transformers_samplers():
     'AutoAWQ': transformers_samplers(),
     'QuIP#': transformers_samplers(),
     'HQQ': transformers_samplers(),
+    'IPEX-LLM': transformers_samplers(),
     'ExLlamav2': {
         'temperature',
         'temperature_last',
diff --git a/modules/models.py b/modules/models.py
index 1519fc8941..c02a255744 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -71,6 +71,7 @@ def load_model(model_name, loader=None):
         'AutoAWQ': AutoAWQ_loader,
         'QuIP#': QuipSharp_loader,
         'HQQ': HQQ_loader,
+        'IPEX-LLM': ipex_llm_loader,
     }
 
     metadata = get_model_metadata(model_name)
@@ -376,6 +377,35 @@ def HQQ_loader(model_name):
     return model
 
 
+def ipex_llm_loader(model_name):
+
+    from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM
+
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+
+    config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
+
+    if 'chatglm' in model_name.lower():
+        LoaderClass = AutoModel
+    else:
+        if config.to_dict().get('is_encoder_decoder', False):
+            LoaderClass = AutoModelForSeq2SeqLM
+            shared.is_seq2seq = True
+        else:
+            LoaderClass = AutoModelForCausalLM
+
+    model = LoaderClass.from_pretrained(
+        path_to_model,
+        load_in_4bit=shared.args.load_in_4bit,
+        optimize_model=True,
+        trust_remote_code=shared.args.trust_remote_code,
+        use_cache=True)
+
+    tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
+
+    return model, tokenizer
+
+
 def get_max_memory_dict():
     max_memory = {}
     max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
diff --git a/modules/shared.py b/modules/shared.py
index 46b6ef478f..e2c12e3b88 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -106,7 +106,7 @@
 
 # bitsandbytes 4-bit
 group = parser.add_argument_group('bitsandbytes 4-bit')
-group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
+group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes or ipex-llm).')
 group.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.')
 group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.')
 group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.')
@@ -165,6 +165,10 @@
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
 
+# IPEX-LLM
+# --load-in-4bit is the same as bitsandbytes 4-bit
+# --trust-remote-code is the same as Transformers
+
 # DeepSpeed
 group = parser.add_argument_group('DeepSpeed')
 group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@@ -263,6 +267,8 @@ def fix_loader_name(name):
         return 'QuIP#'
     elif name in ['hqq']:
         return 'HQQ'
+    elif name in ['IPEX-LLM', 'ipex-llm']:
+        return 'IPEX-LLM'
 
 
 def add_extension(name, last=False):
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f99c605e13..e3d3ceded3 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -132,6 +132,8 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
         return input_ids
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
+    elif shared.args.loader == 'IPEX-LLM':
+        return input_ids
     elif torch.backends.mps.is_available():
         device = torch.device('mps')
         return input_ids.to(device)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 8d6122d252..52636de296 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -112,7 +112,7 @@ def create_ui():
 
                         with gr.Column():
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
-                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
+                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model with 4-bit precision.")
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
diff --git a/requirements.txt b/requirements.txt
index 17f7796644..acaf5c044c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,6 +26,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index b6e2a944b4..5425f77da6 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -24,6 +24,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 21c0f68832..361c9220fb 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -24,6 +24,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index fec052c3fd..d26080223d 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -26,6 +26,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index e85a19bd54..98439fc193 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -24,6 +24,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
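
For reference, a minimal standalone sketch of the loading path that the new `ipex_llm_loader` wraps, using only the keyword arguments shown in the patch above. The model directory name below is illustrative, not part of the change:

```python
# Hypothetical example: load a local HF-format model the same way the new loader does.
from ipex_llm.transformers import AutoModelForCausalLM  # ipex-llm drop-in for transformers
from transformers import AutoTokenizer

path_to_model = "models/my-llama-7b"  # illustrative path to a local model directory

model = AutoModelForCausalLM.from_pretrained(
    path_to_model,
    load_in_4bit=True,        # symmetric int4 quantization, exposed as --load-in-4bit in the UI
    optimize_model=True,      # ipex-llm model optimizations, hard-coded on in the loader above
    trust_remote_code=False,  # mirror of --trust-remote-code
    use_cache=True,
)
tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=False)

# Quick smoke test of the quantized model.
inputs = tokenizer("Hello, my name is", return_tensors="pt")
output_ids = model.generate(inputs["input_ids"], max_new_tokens=16)
print(tokenizer.decode(output_ids[0]))
```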