From c53bafe5d4e056a6a2d1fb8134a96d86adadbd94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cheen=20Hau=2C=20=E4=BF=8A=E8=B1=AA?= <33478814+chtanch@users.noreply.github.com> Date: Fri, 29 Mar 2024 10:25:19 +0800 Subject: [PATCH 1/5] Add bigdl-llm loader to bigdl-upstream (#17) * Add bigdl-llm loader * Add BigDL-LLM if-else fork for encode * Remove 'cpu-embedding' parameter * Migrate source code to ipex-llm * Add command-line flags for ipex-llm in readme Edit description of options in GUI --- README.md | 8 ++++++++ modules/loaders.py | 8 ++++++++ modules/models.py | 30 ++++++++++++++++++++++++++++++ modules/shared.py | 13 +++++++++++++ modules/text_generation.py | 2 ++ modules/ui.py | 4 ++++ modules/ui_model_menu.py | 14 +++++++++++++- 7 files changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 686acac11c..50eb03e88b 100644 --- a/README.md +++ b/README.md @@ -315,6 +315,14 @@ List of command-line flags |-------------|-------------| | `--hqq-backend` | Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. | +#### IPEX-LLM + +| Flag | Description | +|---------------------------------------|-------------| +| `--load-in-4bit` | Load the model with symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | +| `--load-in-low-bit PRECISION` | Load the model with specified precision. Supported options are sym_int4, asym_int4, sym_int5, asym_int5, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, and bf16. This option is mutually exclusive with `--load-in-4bit`.| + + #### DeepSpeed | Flag | Description | diff --git a/modules/loaders.py b/modules/loaders.py index 513fd910cf..898870a550 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -151,6 +151,13 @@ 'hqq_backend', 'trust_remote_code', 'no_use_fast', + ], + 'IPEX-LLM': [ + 'ipex_llm_load_in_4bit', + 'ipex_llm_load_in_low_bit', + 'optimize_model', + 'trust_remote_code', + 'use_cache', ] }) @@ -210,6 +217,7 @@ def transformers_samplers(): 'AutoAWQ': transformers_samplers(), 'QuIP#': transformers_samplers(), 'HQQ': transformers_samplers(), + 'IPEX-LLM': transformers_samplers(), 'ExLlamav2': { 'temperature', 'temperature_last', diff --git a/modules/models.py b/modules/models.py index 605680630c..48fc9faaa2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -71,6 +71,7 @@ def load_model(model_name, loader=None): 'AutoAWQ': AutoAWQ_loader, 'QuIP#': QuipSharp_loader, 'HQQ': HQQ_loader, + 'IPEX-LLM': ipex_llm_loader, } metadata = get_model_metadata(model_name) @@ -391,6 +392,35 @@ def HQQ_loader(model_name): HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) return model +def ipex_llm_loader(model_name): + + from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM + + path_to_model = Path(f'{shared.args.model_dir}/{model_name}') + + config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) + + if 'chatglm' in model_name.lower(): + LoaderClass = AutoModel + else: + if config.to_dict().get('is_encoder_decoder', False): + LoaderClass = AutoModelForSeq2SeqLM + shared.is_seq2seq = True + else: + LoaderClass = AutoModelForCausalLM + + model = LoaderClass.from_pretrained( + path_to_model, + load_in_4bit=shared.args.load_in_4bit, + load_in_low_bit=shared.args.load_in_low_bit, + optimize_model=shared.args.optimize_model, + trust_remote_code=shared.args.trust_remote_code, + use_cache=shared.args.use_cache, + ) + + tokenizer = AutoTokenizer.from_pretrained(path_to_model, 
trust_remote_code=shared.args.trust_remote_code) + + return model, tokenizer def get_max_memory_dict(): max_memory = {} diff --git a/modules/shared.py b/modules/shared.py index 7bef04bf49..3b7a74a2db 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -164,6 +164,17 @@ group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') +# IPEX-LLM +group = parser.add_argument_group('IPEX-LLM') +group.add_argument('--load-in-4bit', action='store_true', default=False, help='boolean value, True means loading linear’s weight to symmetric int 4 if'\ + 'the model is a regular fp16/bf16/fp32 model, and to asymmetric int 4 if the model is GPTQ model.Default to be False') +group.add_argument('--load-in-low-bit', type=str, default=None, help='str value, options are sym_int4, asym_int4, sym_int5, asym_int5'\ + ', sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16 or bf16. sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4,'\ + 'nf4 means 4-bit NormalFloat, etc. Relevant low bit optimizations will be applied to the model.') +group.add_argument('--optimize-model', action='store_true', default=True, help='boolean value, Whether to further optimize the low_bit llm model.') +group.add_argument('--use-cache', action='store_true', default=True, help='If use_cache is True, past key values are used to speed up decoding if applicable to model.') +group.add_argument('--trust-remote-code', action='store_true', default=True, help='Set trust_remote_code=True while loading the model. Necessary for some models.') + # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -264,6 +275,8 @@ def fix_loader_name(name): return 'QuIP#' elif name in ['hqq']: return 'HQQ' + elif name in ['IPEX-LLM', 'ipex-llm']: + return 'IPEX-LLM' def add_extension(name, last=False): diff --git a/modules/text_generation.py b/modules/text_generation.py index 227d1822d1..48dce0d4c4 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -139,6 +139,8 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) + elif shared.args.loader == 'IPEX-LLM': + return input_ids elif torch.backends.mps.is_available(): device = torch.device('mps') return input_ids.to(device) diff --git a/modules/ui.py b/modules/ui.py index 6249bb481a..b6e1b9fef0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -97,6 +97,10 @@ def list_model_elements(): 'row_split', 'tensorcores', 'hqq_backend', + 'ipex_llm_load_in_4bit', + 'ipex_llm_load_in_low_bit', + 'optimize_model', + 'use_cache', ] if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index ac6a8a8f5b..fc7c79506a 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,6 +109,16 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') + shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", + value=shared.args.load_in_4bit, + info="Load the model with symmetric int4 precision.\n\nTo enable 
this option, start the web UI with the --load-in-4bit flag.", + interactive=shared.args.load_in_4bit) + shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", + choices=["sym_int4", "asym_int4", "sym_int5", "asym_int5", "sym_int8", + "nf3", "nf4", "fp4", "fp8", "fp8_e4m3", "fp8_e5m2", "fp16", "bf16"], + value=shared.args.load_in_low_bit, + info='Load the model with specified precision.\n\nTo enable this option, start the web UI with the --load-in-low-bit flag.', + interactive=shared.args.load_in_4bit is False) with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -146,7 +156,9 @@ def create_ui(): shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") - + shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Enable this option to further optimize the low-bit llm model.") + shared.gradio['use_cache'] = gr.Checkbox(label="use-cache", value=shared.args.use_cache, info="Wether to use past_key_values to speed up model decoding.") + with gr.Column(): with gr.Row(): shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) From d017a8fd3f0f283a317a4151250b4227fcc8ec7f Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 8 Apr 2024 18:43:25 +0800 Subject: [PATCH 2/5] Update style for upstream requests (#27) * update style * minor * fix * revert --- README.md | 7 ++++--- modules/loaders.py | 1 - modules/models.py | 15 ++++++++------- modules/shared.py | 12 ++++-------- modules/ui.py | 3 +-- modules/ui_model_menu.py | 16 +++------------- 6 files changed, 20 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 50eb03e88b..bb59733acf 100644 --- a/README.md +++ b/README.md @@ -319,9 +319,10 @@ List of command-line flags | Flag | Description | |---------------------------------------|-------------| -| `--load-in-4bit` | Load the model with symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | -| `--load-in-low-bit PRECISION` | Load the model with specified precision. Supported options are sym_int4, asym_int4, sym_int5, asym_int5, sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16, and bf16. This option is mutually exclusive with `--load-in-4bit`.| - +| `--load-in-4bit` | Load the model to symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | +| `--load-in-low-bit PRECISION` | Load the model to the specified low-bit precision. Supported options are `sym_int4`, `fp4`, `fp8`, `asym_int4`, `sym_int5`, `asym_int5`, `sym_int8`, `mixed_fp4`, `mixed_fp8`, `nf3`, `nf4`, `fp8_e4m3`, `fp16` and `bf16`. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc. 
This option is mutually exclusive with `--load-in-4bit`.| +| `--optimize-model` | Further optimize the low-bit model with ipex-llm. +| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. | #### DeepSpeed diff --git a/modules/loaders.py b/modules/loaders.py index 898870a550..f0f6522851 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -157,7 +157,6 @@ 'ipex_llm_load_in_low_bit', 'optimize_model', 'trust_remote_code', - 'use_cache', ] }) diff --git a/modules/models.py b/modules/models.py index 48fc9faaa2..1d02973462 100644 --- a/modules/models.py +++ b/modules/models.py @@ -392,6 +392,7 @@ def HQQ_loader(model_name): HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) return model + def ipex_llm_loader(model_name): from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM @@ -410,18 +411,18 @@ def ipex_llm_loader(model_name): LoaderClass = AutoModelForCausalLM model = LoaderClass.from_pretrained( - path_to_model, - load_in_4bit=shared.args.load_in_4bit, - load_in_low_bit=shared.args.load_in_low_bit, - optimize_model=shared.args.optimize_model, - trust_remote_code=shared.args.trust_remote_code, - use_cache=shared.args.use_cache, - ) + path_to_model, + load_in_4bit=shared.args.load_in_4bit, + load_in_low_bit=shared.args.load_in_low_bit, + optimize_model=shared.args.optimize_model, + trust_remote_code=shared.args.trust_remote_code, + use_cache=True) tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) return model, tokenizer + def get_max_memory_dict(): max_memory = {} max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' diff --git a/modules/shared.py b/modules/shared.py index 3b7a74a2db..63dff02079 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -166,14 +166,10 @@ # IPEX-LLM group = parser.add_argument_group('IPEX-LLM') -group.add_argument('--load-in-4bit', action='store_true', default=False, help='boolean value, True means loading linear’s weight to symmetric int 4 if'\ - 'the model is a regular fp16/bf16/fp32 model, and to asymmetric int 4 if the model is GPTQ model.Default to be False') -group.add_argument('--load-in-low-bit', type=str, default=None, help='str value, options are sym_int4, asym_int4, sym_int5, asym_int5'\ - ', sym_int8, nf3, nf4, fp4, fp8, fp8_e4m3, fp8_e5m2, fp16 or bf16. sym_int4 means symmetric int 4, asym_int4 means asymmetric int 4,'\ - 'nf4 means 4-bit NormalFloat, etc. Relevant low bit optimizations will be applied to the model.') -group.add_argument('--optimize-model', action='store_true', default=True, help='boolean value, Whether to further optimize the low_bit llm model.') -group.add_argument('--use-cache', action='store_true', default=True, help='If use_cache is True, past key values are used to speed up decoding if applicable to model.') -group.add_argument('--trust-remote-code', action='store_true', default=True, help='Set trust_remote_code=True while loading the model. Necessary for some models.') +group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') +group.add_argument('--load-in-low-bit', type=str, default=None, help='Load the model to the specified low-bit precision. 
Supported options are sym_int4, fp4, fp8, asym_int4, sym_int5, asym_int5, sym_int8, mixed_fp4, mixed_fp8, nf3, nf4, fp8_e4m3, fp16 or bf16. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc.') +group.add_argument('--optimize-model', action='store_true', help='Further optimize the low-bit model with ipex-llm.') +group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') # DeepSpeed group = parser.add_argument_group('DeepSpeed') diff --git a/modules/ui.py b/modules/ui.py index b6e1b9fef0..155f5b3cac 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -99,8 +99,7 @@ def list_model_elements(): 'hqq_backend', 'ipex_llm_load_in_4bit', 'ipex_llm_load_in_low_bit', - 'optimize_model', - 'use_cache', + 'optimize_model' ] if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index fc7c79506a..620297ce92 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,16 +109,8 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') - shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", - value=shared.args.load_in_4bit, - info="Load the model with symmetric int4 precision.\n\nTo enable this option, start the web UI with the --load-in-4bit flag.", - interactive=shared.args.load_in_4bit) - shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", - choices=["sym_int4", "asym_int4", "sym_int5", "asym_int5", "sym_int8", - "nf3", "nf4", "fp4", "fp8", "fp8_e4m3", "fp8_e5m2", "fp16", "bf16"], - value=shared.args.load_in_low_bit, - info='Load the model with specified precision.\n\nTo enable this option, start the web UI with the --load-in-low-bit flag.', - interactive=shared.args.load_in_4bit is False) + shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model to symmetric int4 precision.\n\nTo enable this option, start the web UI with the --load-in-4bit flag.", interactive=shared.args.load_in_4bit) + shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.\n\nTo enable this option, start the web UI with the argument --load-in-low-bit PRECISION.', interactive=not shared.args.load_in_4bit) with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -146,6 +138,7 @@ def create_ui(): shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, 
info="Further optimize the low-bit model with ipex-llm.") with gr.Blocks(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') @@ -156,9 +149,6 @@ def create_ui(): shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") - shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Enable this option to further optimize the low-bit llm model.") - shared.gradio['use_cache'] = gr.Checkbox(label="use-cache", value=shared.args.use_cache, info="Wether to use past_key_values to speed up model decoding.") - with gr.Column(): with gr.Row(): shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) From 58f4be09471379ead4a493d2dc4b03e5c8ffca17 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 8 Apr 2024 19:52:44 +0800 Subject: [PATCH 3/5] Fix load_in_4bit and load_in_low_bit not taking effect in UI (#28) --- modules/loaders.py | 6 +++--- modules/shared.py | 5 +++-- modules/ui.py | 3 +-- modules/ui_model_menu.py | 7 +++---- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index f0f6522851..5b49fb580b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -153,10 +153,10 @@ 'no_use_fast', ], 'IPEX-LLM': [ - 'ipex_llm_load_in_4bit', - 'ipex_llm_load_in_low_bit', + 'load_in_4bit', + 'load_in_low_bit', 'optimize_model', - 'trust_remote_code', + 'trust_remote_code' ] }) diff --git a/modules/shared.py b/modules/shared.py index 63dff02079..13bc09236a 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -108,7 +108,7 @@ # bitsandbytes 4-bit group = parser.add_argument_group('bitsandbytes 4-bit') -group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).') +group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes or ipex-llm).') group.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.') group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.') group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. 
Valid options: nf4, fp4.') @@ -166,7 +166,8 @@ # IPEX-LLM group = parser.add_argument_group('IPEX-LLM') -group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') +# --load-in-4bit is the same as bitsandbytes 4-bit's argument +# group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') group.add_argument('--load-in-low-bit', type=str, default=None, help='Load the model to the specified low-bit precision. Supported options are sym_int4, fp4, fp8, asym_int4, sym_int5, asym_int5, sym_int8, mixed_fp4, mixed_fp8, nf3, nf4, fp8_e4m3, fp16 or bf16. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc.') group.add_argument('--optimize-model', action='store_true', help='Further optimize the low-bit model with ipex-llm.') group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') diff --git a/modules/ui.py b/modules/ui.py index 155f5b3cac..420171e7e0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -97,8 +97,7 @@ def list_model_elements(): 'row_split', 'tensorcores', 'hqq_backend', - 'ipex_llm_load_in_4bit', - 'ipex_llm_load_in_low_bit', + 'load_in_low_bit', 'optimize_model' ] if is_torch_xpu_available(): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 620297ce92..eee4886f7f 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,12 +109,12 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') - shared.gradio['ipex_llm_load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model to symmetric int4 precision.\n\nTo enable this option, start the web UI with the --load-in-4bit flag.", interactive=shared.args.load_in_4bit) - shared.gradio['ipex_llm_load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.\n\nTo enable this option, start the web UI with the argument --load-in-low-bit PRECISION.', interactive=not shared.args.load_in_4bit) + shared.gradio['load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.', interactive=not shared.args.load_in_4bit) + shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Further optimize the low-bit model with ipex-llm.") with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) - shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) + shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the 
model with 4-bit precision.", interactive=not shared.args.load_in_low_bit) shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) @@ -138,7 +138,6 @@ def create_ui(): shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') - shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Further optimize the low-bit model with ipex-llm.") with gr.Blocks(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') From 716829040e70696fa79c70a0fc38832c1a27dcd2 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 8 Apr 2024 20:15:34 +0800 Subject: [PATCH 4/5] Add dependency to requirements (#32) --- requirements.txt | 2 ++ requirements_cpu_only.txt | 2 ++ requirements_cpu_only_noavx2.txt | 2 ++ requirements_noavx2.txt | 2 ++ requirements_nowheels.txt | 2 ++ 5 files changed, 10 insertions(+) diff --git a/requirements.txt b/requirements.txt index abbd1a62ec..ad8e522cb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,6 +22,8 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # bitsandbytes bitsandbytes==0.42.*; platform_system != "Windows" diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 19286bf10c..6b0af10997 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -22,6 +22,8 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # llama-cpp-python (CPU only, AVX2) https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.52+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index a71e4a7c2f..c0a6d4a2b7 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -22,6 +22,8 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # llama-cpp-python (CPU only, no AVX2) https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.52+cpuavx-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 8244aa5f91..12cfd72e02 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -22,6 +22,8 @@ tensorboard 
transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm # bitsandbytes bitsandbytes==0.42.*; platform_system != "Windows" diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 4dc697a69e..3ace31a1a1 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -22,3 +22,5 @@ tensorboard transformers==4.38.* tqdm wandb +py-cpuinfo +ipex-llm From 1608c01dd08d3107d640246c455c2c30f9a9cf31 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 9 Apr 2024 15:42:09 +0800 Subject: [PATCH 5/5] Remove some arguments (#35) --- README.md | 4 +--- modules/loaders.py | 4 +--- modules/models.py | 3 +-- modules/shared.py | 8 ++------ modules/ui.py | 2 -- modules/ui_model_menu.py | 5 ++--- 6 files changed, 7 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 670efa51bd..b9eec74682 100644 --- a/README.md +++ b/README.md @@ -318,9 +318,7 @@ List of command-line flags | Flag | Description | |---------------------------------------|-------------| -| `--load-in-4bit` | Load the model to symmetric int4 precision. This option is mutually exclusive with `--load-in-low-bit`. | -| `--load-in-low-bit PRECISION` | Load the model to the specified low-bit precision. Supported options are `sym_int4`, `fp4`, `fp8`, `asym_int4`, `sym_int5`, `asym_int5`, `sym_int8`, `mixed_fp4`, `mixed_fp8`, `nf3`, `nf4`, `fp8_e4m3`, `fp16` and `bf16`. asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc. This option is mutually exclusive with `--load-in-4bit`.| -| `--optimize-model` | Further optimize the low-bit model with ipex-llm. +| `--load-in-4bit` | Load the model to symmetric int4 precision with ipex-llm optimizations. | | `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. | #### DeepSpeed diff --git a/modules/loaders.py b/modules/loaders.py index daf2ed7f16..401d24994d 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -151,9 +151,7 @@ ], 'IPEX-LLM': [ 'load_in_4bit', - 'load_in_low_bit', - 'optimize_model', - 'trust_remote_code' + 'trust_remote_code', ] }) diff --git a/modules/models.py b/modules/models.py index b75abb9e06..c02a255744 100644 --- a/modules/models.py +++ b/modules/models.py @@ -397,8 +397,7 @@ def ipex_llm_loader(model_name): model = LoaderClass.from_pretrained( path_to_model, load_in_4bit=shared.args.load_in_4bit, - load_in_low_bit=shared.args.load_in_low_bit, - optimize_model=shared.args.optimize_model, + optimize_model=True, trust_remote_code=shared.args.trust_remote_code, use_cache=True) diff --git a/modules/shared.py b/modules/shared.py index fdda3ce332..e2c12e3b88 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -166,12 +166,8 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') # IPEX-LLM -group = parser.add_argument_group('IPEX-LLM') -# --load-in-4bit is the same as bitsandbytes 4-bit's argument -# group.add_argument('--load-in-4bit', action='store_true', help='Load the model to symmetric int4 precision if it is a regular fp16/bf16/fp32 model, and to asymmetric int4 precision if it is GPTQ model.') -group.add_argument('--load-in-low-bit', type=str, default=None, help='Load the model to the specified low-bit precision. Supported options are sym_int4, fp4, fp8, asym_int4, sym_int5, asym_int5, sym_int8, mixed_fp4, mixed_fp8, nf3, nf4, fp8_e4m3, fp16 or bf16. 
asym_int4 means asymmetric int4, fp8 means 8-bit floating point, mixed_fp8 means mixture of 8-bit quantization, nf4 means 4-bit NormalFloat, etc.') -group.add_argument('--optimize-model', action='store_true', help='Further optimize the low-bit model with ipex-llm.') -group.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') +# --load-in-4bit is the same as bitsandbytes 4-bit +# --trust-remote-code is the same as Transformers # DeepSpeed group = parser.add_argument_group('DeepSpeed') diff --git a/modules/ui.py b/modules/ui.py index 00095f3f20..56b1518c7a 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -101,8 +101,6 @@ def list_model_elements(): 'streaming_llm', 'attention_sink_size', 'hqq_backend', - 'load_in_low_bit', - 'optimize_model' ] if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index a736264eb2..52636de296 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -109,12 +109,10 @@ def create_ui(): shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.') shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.') - shared.gradio['load_in_low_bit'] = gr.Dropdown(label="load-in-low-bit", choices=["sym_int4", "fp4", "fp8", "asym_int4", "sym_int5", "asym_int5", "sym_int8", "nf3", "nf4", "fp8_e4m3", "fp16", "bf16"], value=shared.args.load_in_low_bit, info='Load the model to the specified low-bit precision.', interactive=not shared.args.load_in_4bit) - shared.gradio['optimize_model'] = gr.Checkbox(label="optimize-model", value=shared.args.optimize_model, info="Further optimize the low-bit model with ipex-llm.") with gr.Column(): shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) - shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model with 4-bit precision.", interactive=not shared.args.load_in_low_bit) + shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model with 4-bit precision.") shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices) @@ -151,6 +149,7 @@ def create_ui(): shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. 
To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") + with gr.Column(): with gr.Row(): shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)
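
Note: for reference, below is a minimal standalone sketch of the load path this series ends up adding (the final ipex_llm_loader state after PATCH 5/5). It assumes ipex-llm and transformers are installed; the model directory is a placeholder, not a path from the patches.

    from pathlib import Path

    from transformers import AutoConfig, AutoTokenizer
    from ipex_llm.transformers import (AutoModel, AutoModelForCausalLM,
                                       AutoModelForSeq2SeqLM)

    model_dir = Path('models/your-model')  # placeholder model directory

    config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
    if 'chatglm' in model_dir.name.lower():
        LoaderClass = AutoModel                      # chatglm-style checkpoints
    elif config.to_dict().get('is_encoder_decoder', False):
        LoaderClass = AutoModelForSeq2SeqLM          # encoder-decoder models
    else:
        LoaderClass = AutoModelForCausalLM           # regular causal LMs

    # Mirrors ipex_llm_loader(): symmetric int4 weights plus ipex-llm model
    # optimizations, with the KV cache enabled to speed up decoding.
    model = LoaderClass.from_pretrained(
        model_dir,
        load_in_4bit=True,
        optimize_model=True,
        trust_remote_code=True,
        use_cache=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

With the series applied, the equivalent web UI invocation would be along the lines of `python server.py --model <your-model> --loader ipex-llm --load-in-4bit --trust-remote-code`, with the flag meanings given in the README table added above.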