From c53bafe5d4e056a6a2d1fb8134a96d86adadbd94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cheen=20Hau=2C=20=E4=BF=8A=E8=B1=AA?= <33478814+chtanch@users.noreply.github.com> Date: Fri, 29 Mar 2024 10:25:19 +0800 Subject: [PATCH 1/5] Add bigdl-llm loader to bigdl-upstream (#17) * Add bigdl-llm loader * Add BigDL-LLM if-else fork for encode * Remove 'cpu-embedding' parameter * Migrate source code to ipex-llm * Add command-line flags for ipex-llm in readme Edit description of options in GUI --- README.md | 8 ++++++++ modules/loaders.py | 8 ++++++++ modules/models.py | 30 ++++++++++++++++++++++++++++++ modules/shared.py | 13 +++++++++++++ modules/text_generation.py | 2 ++ modules/ui.py | 4 ++++ modules/ui_model_menu.py | 14 +++++++++++++- 7 files changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 686acac11c..50eb03e88b 100644 --- a/README.md +++ b/README.md @@ -315,6 +315,14 @@ List of command-line flags |-------------|-------------| | `--hqq-backend` | Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. | +#### IPEX-LLM + +| Flag | Description | +|---------------------------------------|-------------| +| `--load-in-4bit` | Load the model with symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | +| `--load-in-low-bit PRECISION` | Load the model with specified precision. Supported options are sym_int4, asym_int4, sym_int5, asym_int5, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, and bf16. This option is mutually exclusive with `--load-in-4bit`.| + + #### DeepSpeed | Flag | Description | diff --git a/modules/loaders.py b/modules/loaders.py index 513fd910cf..898870a550 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -151,6 +151,13 @@ 'hqq_backend', 'trust_remote_code', 'no_use_fast', + ], + 'IPEX-LLM': [ + 'ipex_llm_load_in_4bit', + 'ipex_llm_load_in_low_bit', + 'optimize_model', + 'trust_remote_code', + 'use_cache', ] }) @@ -210,6 +217,7 @@ def transformers_samplers(): 'AutoAWQ': transformers_samplers(), 'QuIP#': transformers_samplers(), 'HQQ': transformers_samplers(), + 'IPEX-LLM': transformers_samplers(), 'ExLlamav2': { 'temperature', 'temperature_last', diff --git a/modules/models.py b/modules/models.py index 605680630c..48fc9faaa2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -71,6 +71,7 @@ def load_model(model_name, loader=None): 'AutoAWQ': AutoAWQ_loader, 'QuIP#': QuipSharp_loader, 'HQQ': HQQ_loader, + 'IPEX-LLM': ipex_llm_loader, } metadata = get_model_metadata(model_name) @@ -391,6 +392,35 @@ def HQQ_loader(model_name): HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) return model +def ipex_llm_loader(model_name): + + from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM + + path_to_model = Path(f'{shared.args.model_dir}/{model_name}') + + config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) + + if 'chatglm' in model_name.lower(): + LoaderClass = AutoModel + else: + if config.to_dict().get('is_encoder_decoder', False): + LoaderClass = AutoModelForSeq2SeqLM + shared.is_seq2seq = True + else: + LoaderClass = AutoModelForCausalLM + + model = LoaderClass.from_pretrained( + path_to_model, + load_in_4bit=shared.args.load_in_4bit, + load_in_low_bit=shared.args.load_in_low_bit, + optimize_model=shared.args.optimize_model, + trust_remote_code=shared.args.trust_remote_code, + use_cache=shared.args.use_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(path_to_model, 
trust_remote_code=shared.args.trust_remote_code) + + return model, tokenizer def get_max_memory_dict(): max_memory = {} diff --git a/modules/shared.py b/modules/shared.py index 7bef04bf49..3b7a74a2db 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -164,6 +164,17 @@ group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') +# IPEX-LLM +group = parser.add_argument_group('IPEX-LLM') +group.add_argument('--load-in-4bit', action='store_true', default=False, help='boolean value, True means loading linear’s weight to symmetric int 4 if'\ + 'the model is a regular fp16/bf16/fp32 model, and to asymmetric int 4 if the model is GPTQ model.Default to be False') +group.add_argument('--load-in-low-bit', type=str, default=None, help='str value, options are sym_int4, asym_int4, sym_int5, asym_int5'\ + ', sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16 or bf16. sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4,'\ + 'nf4 means 4-bit NormalFloat, etc. Relevant low bit optimizations will be applied to the model.') +group.add_argument('--optimize-model', action='store_true', default=True, help='boolean value, Whether to further optimize the low_bit llm model.') +group.add_argument('--use-cache', action='store_true', default=True, help='If use_cache is True, past key values are used to speed up decoding if applicable to model.') +group.add_argument('--trust-remote-code', action='store_true', default=True, help='Set trust_remote_code=True while loading the model. Necessary for some models.') + # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -264,6 +275,8 @@ def fix_loader_name(name): return 'QuIP#' elif name in ['hqq']: return 'HQQ' + elif name in ['IPEX-LLM', 'ipex-llm']: + return 'IPEX-LLM' def add_extension(name, last=False): diff --git a/modules/text_generation.py b/modules/text_generation.py index 227d1822d1..48dce0d4c4 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -139,6 +139,8 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) + elif shared.args.loader == 'IPEX-LLM': + return input_ids elif torch.backends.mps.is_available(): device = torch.device('mps') return input_ids.to(device) diff --git a/modules/ui.py b/modules/ui.py index 6249bb481a..b6e1b9fef0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -97,6 +97,10 @@ def list_model_elements(): 'row_split', 'tensorcores', 'hqq_backend', + 'ipex_llm_load_in_4bit', + 'ipex_llm_load_in_low_bit', + 'optimize_model', + 'use_cache', ] if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index ac6a8a8f5b..fc7c79506a 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,6 +109,16 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') + shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", + value=shared.args.load_in_4bit, + info="Load the model with symmetric int4 precision.\n\nTo enable 
this option, start the web UI with the --load-in-4bit flag.", + interactive=shared.args.load_in_4bit) + shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", + choices=["sym_int4", "asym_int4", "sym_int5", "asym_int5", "sym_int8", + "nf3", "nf4", "fp4", "fp8", "fp8_e4m3", "fp8_e5m2", "fp16", "bf16"], + value=shared.args.load_in_low_bit, + info='Load the model with specified precision.\n\nTo enable this option, start the web UI with the --load-in-low-bit flag.', + interactive=shared.args.load_in_4bit is False) with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -146,7 +156,9 @@ def create_ui(): shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") - + shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Enable this option to further optimize the low-bit llm model.") + shared.gradio['use_cache'] = gr.Checkbox(label="use-cache", value=shared.args.use_cache, info="Wether to use past_key_values to speed up model decoding.") + with gr.Column(): with gr.Row(): shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) From d017a8fd3f0f283a317a4151250b4227fcc8ec7f Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 8 Apr 2024 18:43:25 +0800 Subject: [PATCH 2/5] Update style for upstream requests (#27) * update style * minor * fix * revert --- README.md | 7 ++++--- modules/loaders.py | 1 - modules/models.py | 15 ++++++++------- modules/shared.py | 12 ++++-------- modules/ui.py | 3 +-- modules/ui_model_menu.py | 16 +++------------- 6 files changed, 20 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 50eb03e88b..bb59733acf 100644 --- a/README.md +++ b/README.md @@ -319,9 +319,10 @@ List of command-line flags | Flag | Description | |---------------------------------------|-------------| -| `--load-in-4bit` | Load the model with symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | -| `--load-in-low-bit PRECISION` | Load the model with specified precision. Supported options are sym_int4, asym_int4, sym_int5, asym_int5, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, and bf16. This option is mutually exclusive with `--load-in-4bit`.| - +| `--load-in-4bit` | Load the model to symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | +| `--load-in-low-bit PRECISION` | Load the model to the specified low-bit precision. Supported options are `sym_int4`, `fp4`, `fp8`, `asym_int4`, `sym_int5`, `asym_int5`, `sym_int8`, `mixed_fp4`, `mixed_fp8`, `nf3`, `nf4`, `fp8_e4m3`, `fp16` and `bf16`. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc. 
This option is mutually exclusive with `--load-in-4bit`.| +| `--optimize-model` | Further optimize the low-bit model with ipex-llm. +| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. | #### DeepSpeed diff --git a/modules/loaders.py b/modules/loaders.py index 898870a550..f0f6522851 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -157,7 +157,6 @@ 'ipex_llm_load_in_low_bit', 'optimize_model', 'trust_remote_code', - 'use_cache', ] }) diff --git a/modules/models.py b/modules/models.py index 48fc9faaa2..1d02973462 100644 --- a/modules/models.py +++ b/modules/models.py @@ -392,6 +392,7 @@ def HQQ_loader(model_name): HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) return model + def ipex_llm_loader(model_name): from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM @@ -410,18 +411,18 @@ def ipex_llm_loader(model_name): LoaderClass = AutoModelForCausalLM model = LoaderClass.from_pretrained( - path_to_model, - load_in_4bit=shared.args.load_in_4bit, - load_in_low_bit=shared.args.load_in_low_bit, - optimize_model=shared.args.optimize_model, - trust_remote_code=shared.args.trust_remote_code, - use_cache=shared.args.use_cache, - ) + path_to_model, + load_in_4bit=shared.args.load_in_4bit, + load_in_low_bit=shared.args.load_in_low_bit, + optimize_model=shared.args.optimize_model, + trust_remote_code=shared.args.trust_remote_code, + use_cache=True) tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) return model, tokenizer + def get_max_memory_dict(): max_memory = {} max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' diff --git a/modules/shared.py b/modules/shared.py index 3b7a74a2db..63dff02079 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -166,14 +166,10 @@ # IPEX-LLM group = parser.add_argument_group('IPEX-LLM') -group.add_argument('--load-in-4bit', action='store_true', default=False, help='boolean value, True means loading linear’s weight to symmetric int 4 if'\ - 'the model is a regular fp16/bf16/fp32 model, and to asymmetric int 4 if the model is GPTQ model.Default to be False') -group.add_argument('--load-in-low-bit', type=str, default=None, help='str value, options are sym_int4, asym_int4, sym_int5, asym_int5'\ - ', sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16 or bf16. sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4,'\ - 'nf4 means 4-bit NormalFloat, etc. Relevant low bit optimizations will be applied to the model.') -group.add_argument('--optimize-model', action='store_true', default=True, help='boolean value, Whether to further optimize the low_bit llm model.') -group.add_argument('--use-cache', action='store_true', default=True, help='If use_cache is True, past key values are used to speed up decoding if applicable to model.') -group.add_argument('--trust-remote-code', action='store_true', default=True, help='Set trust_remote_code=True while loading the model. Necessary for some models.') +group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') +group.add_argument('--load-in-low-bit', type=str, default=None, help='Load the model to the specified low-bit precision. 
Supported options are sym_int4, fp4, fp8, asym_int4, sym_int5, asym_int5, sym_int8, mixed_fp4, mixed_fp8, nf3, nf4, fp8_e4m3, fp16 or bf16. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc.') +group.add_argument('--optimize-model', action='store_true', help='Further optimize the low-bit model with ipex-llm.') +group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') # DeepSpeed group = parser.add_argument_group('DeepSpeed') diff --git a/modules/ui.py b/modules/ui.py index b6e1b9fef0..155f5b3cac 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -99,8 +99,7 @@ def list_model_elements(): 'hqq_backend', 'ipex_llm_load_in_4bit', 'ipex_llm_load_in_low_bit', - 'optimize_model', - 'use_cache', + 'optimize_model' ] if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index fc7c79506a..620297ce92 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,16 +109,8 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') - shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", - value=shared.args.load_in_4bit, - info="Load the model with symmetric int4 precision.\n\nTo enable this option, start the web UI with the --load-in-4bit flag.", - interactive=shared.args.load_in_4bit) - shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", - choices=["sym_int4", "asym_int4", "sym_int5", "asym_int5", "sym_int8", - "nf3", "nf4", "fp4", "fp8", "fp8_e4m3", "fp8_e5m2", "fp16", "bf16"], - value=shared.args.load_in_low_bit, - info='Load the model with specified precision.\n\nTo enable this option, start the web UI with the --load-in-low-bit flag.', - interactive=shared.args.load_in_4bit is False) + shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model to symmetric int4 precision.\n\nTo enable this option, start the web UI with the --load-in-4bit flag.", interactive=shared.args.load_in_4bit) + shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.\n\nTo enable this option, start the web UI with the argument --load-in-low-bit PRECISION.', interactive=not shared.args.load_in_4bit) with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -146,6 +138,7 @@ def create_ui(): shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, 
info="Further optimize the low-bit model with ipex-llm.") with gr.Blocks(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') @@ -156,9 +149,6 @@ def create_ui(): shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") - shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Enable this option to further optimize the low-bit llm model.") - shared.gradio['use_cache'] = gr.Checkbox(label="use-cache", value=shared.args.use_cache, info="Wether to use past_key_values to speed up model decoding.") - with gr.Column(): with gr.Row(): shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) From 58f4be09471379ead4a493d2dc4b03e5c8ffca17 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 8 Apr 2024 19:52:44 +0800 Subject: [PATCH 3/5] Fix load_in_4bit and load_in_low_bit not taking effect in UI (#28) --- modules/loaders.py | 6 +++--- modules/shared.py | 5 +++-- modules/ui.py | 3 +-- modules/ui_model_menu.py | 7 +++---- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index f0f6522851..5b49fb580b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -153,10 +153,10 @@ 'no_use_fast', ], 'IPEX-LLM': [ - 'ipex_llm_load_in_4bit', - 'ipex_llm_load_in_low_bit', + 'load_in_4bit', + 'load_in_low_bit', 'optimize_model', - 'trust_remote_code', + 'trust_remote_code' ] }) diff --git a/modules/shared.py b/modules/shared.py index 63dff02079..13bc09236a 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -108,7 +108,7 @@ # bitsandbytes 4-bit group = parser.add_argument_group('bitsandbytes 4-bit') -group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).') +group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes or ipex-llm).') group.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.') group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.') group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. 
Valid options: nf4, fp4.') @@ -166,7 +166,8 @@ # IPEX-LLM group = parser.add_argument_group('IPEX-LLM') -group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') +# --load-in-4bit is the same as bitsandbytes 4-bit's argument +# group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') group.add_argument('--load-in-low-bit', type=str, default=None, help='Load the model to the specified low-bit precision. Supported options are sym_int4, fp4, fp8, asym_int4, sym_int5, asym_int5, sym_int8, mixed_fp4, mixed_fp8, nf3, nf4, fp8_e4m3, fp16 or bf16. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc.') group.add_argument('--optimize-model', action='store_true', help='Further optimize the low-bit model with ipex-llm.') group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') diff --git a/modules/ui.py b/modules/ui.py index 155f5b3cac..420171e7e0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -97,8 +97,7 @@ def list_model_elements(): 'row_split', 'tensorcores', 'hqq_backend', - 'ipex_llm_load_in_4bit', - 'ipex_llm_load_in_low_bit', + 'load_in_low_bit', 'optimize_model' ] if is_torch_xpu_available(): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 620297ce92..eee4886f7f 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,12 +109,12 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') - shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model to symmetric int4 precision.\n\nTo enable this option, start the web UI with the --load-in-4bit flag.", interactive=shared.args.load_in_4bit) - shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.\n\nTo enable this option, start the web UI with the argument --load-in-low-bit PRECISION.', interactive=not shared.args.load_in_4bit) + shared.gradio['load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.', interactive=not shared.args.load_in_4bit) + shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Further optimize the low-bit model with ipex-llm.") with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) - shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) + shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the 
model with 4-bit precision.", interactive=not shared.args.load_in_low_bit) shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) @@ -138,7 +138,6 @@ def create_ui(): shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') - shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Further optimize the low-bit model with ipex-llm.") with gr.Blocks(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') From 716829040e70696fa79c70a0fc38832c1a27dcd2 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 8 Apr 2024 20:15:34 +0800 Subject: [PATCH 4/5] Add dependency to requirements (#32) --- requirements.txt | 2 ++ requirements_cpu_only.txt | 2 ++ requirements_cpu_only_noavx2.txt | 2 ++ requirements_noavx2.txt | 2 ++ requirements_nowheels.txt | 2 ++ 5 files changed, 10 insertions(+) diff --git a/requirements.txt b/requirements.txt index abbd1a62ec..ad8e522cb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,6 +22,8 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # bitsandbytes bitsandbytes==0.42.*; platform_system != "Windows" diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 19286bf10c..6b0af10997 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -22,6 +22,8 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # llama-cpp-python (CPU only, AVX2) https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.52+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index a71e4a7c2f..c0a6d4a2b7 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -22,6 +22,8 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # llama-cpp-python (CPU only, no AVX2) https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.52+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 8244aa5f91..12cfd72e02 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -22,6 +22,8 @@ tensorboard 
transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # bitsandbytes bitsandbytes==0.42.*; platform_system != "Windows" diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 4dc697a69e..3ace31a1a1 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -22,3 +22,5 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm From 1608c01dd08d3107d640246c455c2c30f9a9cf31 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 9 Apr 2024 15:42:09 +0800 Subject: [PATCH 5/5] Remove some arguments (#35) --- README.md | 4 +--- modules/loaders.py | 4 +--- modules/models.py | 3 +-- modules/shared.py | 8 ++------ modules/ui.py | 2 -- modules/ui_model_menu.py | 5 ++--- 6 files changed, 7 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 670efa51bd..b9eec74682 100644 --- a/README.md +++ b/README.md @@ -318,9 +318,7 @@ List of command-line flags | Flag | Description | |---------------------------------------|-------------| -| `--load-in-4bit` | Load the model to symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | -| `--load-in-low-bit PRECISION` | Load the model to the specified low-bit precision. Supported options are `sym_int4`, `fp4`, `fp8`, `asym_int4`, `sym_int5`, `asym_int5`, `sym_int8`, `mixed_fp4`, `mixed_fp8`, `nf3`, `nf4`, `fp8_e4m3`, `fp16` and `bf16`. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc. This option is mutually exclusive with `--load-in-4bit`.| -| `--optimize-model` | Further optimize the low-bit model with ipex-llm. +| `--load-in-4bit` | Load the model to symmetric int4 precision with ipex-llm optimizations. | | `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. | #### DeepSpeed diff --git a/modules/loaders.py b/modules/loaders.py index daf2ed7f16..401d24994d 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -151,9 +151,7 @@ ], 'IPEX-LLM': [ 'load_in_4bit', - 'load_in_low_bit', - 'optimize_model', - 'trust_remote_code' + 'trust_remote_code', ] }) diff --git a/modules/models.py b/modules/models.py index b75abb9e06..c02a255744 100644 --- a/modules/models.py +++ b/modules/models.py @@ -397,8 +397,7 @@ def ipex_llm_loader(model_name): model = LoaderClass.from_pretrained( path_to_model, load_in_4bit=shared.args.load_in_4bit, - load_in_low_bit=shared.args.load_in_low_bit, - optimize_model=shared.args.optimize_model, + optimize_model=True, trust_remote_code=shared.args.trust_remote_code, use_cache=True) diff --git a/modules/shared.py b/modules/shared.py index fdda3ce332..e2c12e3b88 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -166,12 +166,8 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') # IPEX-LLM -group = parser.add_argument_group('IPEX-LLM') -# --load-in-4bit is the same as bitsandbytes 4-bit's argument -# group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') -group.add_argument('--load-in-low-bit', type=str, default=None, help='Load the model to the specified low-bit precision. Supported options are sym_int4, fp4, fp8, asym_int4, sym_int5, asym_int5, sym_int8, mixed_fp4, mixed_fp8, nf3, nf4, fp8_e4m3, fp16 or bf16. 
asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc.') -group.add_argument('--optimize-model', action='store_true', help='Further optimize the low-bit model with ipex-llm.') -group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') +# --load-in-4bit is the same as bitsandbytes 4-bit +# --trust-remote-code is the same as Transformers # DeepSpeed group = parser.add_argument_group('DeepSpeed') diff --git a/modules/ui.py b/modules/ui.py index 00095f3f20..56b1518c7a 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -101,8 +101,6 @@ def list_model_elements(): 'streaming_llm', 'attention_sink_size', 'hqq_backend', - 'load_in_low_bit', - 'optimize_model' ] if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index a736264eb2..52636de296 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,12 +109,10 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') - shared.gradio['load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.', interactive=not shared.args.load_in_4bit) - shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Further optimize the low-bit model with ipex-llm.") with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) - shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model with 4-bit precision.", interactive=not shared.args.load_in_low_bit) + shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model with 4-bit precision.") shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) @@ -151,6 +149,7 @@ def create_ui(): shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. 
To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") + with gr.Column(): with gr.Row(): shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)
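
Note: for reference, below is a minimal standalone sketch of the load path this series ends up adding (the final ipex_llm_loader state after PATCH 5/5). It assumes ipex-llm and transformers are installed; the model directory is a placeholder, not a path from the patches.

    from pathlib import Path

    from transformers import AutoConfig, AutoTokenizer
    from ipex_llm.transformers import (AutoModel, AutoModelForCausalLM,
                                       AutoModelForSeq2SeqLM)

    model_dir = Path('models/your-model')  # placeholder model directory

    config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
    if 'chatglm' in model_dir.name.lower():
        LoaderClass = AutoModel                      # chatglm-style checkpoints
    elif config.to_dict().get('is_encoder_decoder', False):
        LoaderClass = AutoModelForSeq2SeqLM          # encoder-decoder models
    else:
        LoaderClass = AutoModelForCausalLM           # regular causal LMs

    # Mirrors ipex_llm_loader(): symmetric int4 weights plus ipex-llm model
    # optimizations, with the KV cache enabled to speed up decoding.
    model = LoaderClass.from_pretrained(
        model_dir,
        load_in_4bit=True,
        optimize_model=True,
        trust_remote_code=True,
        use_cache=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

With the series applied, the equivalent web UI invocation would be along the lines of `python server.py --model <your-model> --loader ipex-llm --load-in-4bit --trust-remote-code`, with the flag meanings given in the README table added above.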