diff --git a/README.md b/README.md
index 9f3e81bdba..b9eec74682 100644
--- a/README.md
+++ b/README.md
@@ -314,6 +314,13 @@ List of command-line flags
 |-------------|-------------|
 | `--hqq-backend` | Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. |
 
+#### IPEX-LLM
+
+| Flag | Description |
+|---------------------------------------|-------------|
+| `--load-in-4bit` | Load the model to symmetric int4 precision with ipex-llm optimizations. |
+| `--trust-remote-code` | Set `trust_remote_code=True` while loading the model. Necessary for some models. |
+
 #### DeepSpeed
 
 | Flag | Description |
diff --git a/modules/loaders.py b/modules/loaders.py
index 234773397d..401d24994d 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -148,6 +148,10 @@
         'hqq_backend',
         'trust_remote_code',
         'no_use_fast',
+    ],
+    'IPEX-LLM': [
+        'load_in_4bit',
+        'trust_remote_code',
     ]
 })
 
@@ -203,6 +207,7 @@ def transformers_samplers():
     'AutoAWQ': transformers_samplers(),
     'QuIP#': transformers_samplers(),
     'HQQ': transformers_samplers(),
+    'IPEX-LLM': transformers_samplers(),
     'ExLlamav2': {
         'temperature',
         'temperature_last',
diff --git a/modules/models.py b/modules/models.py
index 1519fc8941..c02a255744 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -71,6 +71,7 @@ def load_model(model_name, loader=None):
         'AutoAWQ': AutoAWQ_loader,
         'QuIP#': QuipSharp_loader,
         'HQQ': HQQ_loader,
+        'IPEX-LLM': ipex_llm_loader,
     }
 
     metadata = get_model_metadata(model_name)
@@ -376,6 +377,35 @@ def HQQ_loader(model_name):
     return model
 
 
+def ipex_llm_loader(model_name):
+
+    from ipex_llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM
+
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+
+    config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
+
+    if 'chatglm' in model_name.lower():
+        LoaderClass = AutoModel
+    else:
+        if config.to_dict().get('is_encoder_decoder', False):
+            LoaderClass = AutoModelForSeq2SeqLM
+            shared.is_seq2seq = True
+        else:
+            LoaderClass = AutoModelForCausalLM
+
+    model = LoaderClass.from_pretrained(
+        path_to_model,
+        load_in_4bit=shared.args.load_in_4bit,
+        optimize_model=True,
+        trust_remote_code=shared.args.trust_remote_code,
+        use_cache=True)
+
+    tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
+
+    return model, tokenizer
+
+
 def get_max_memory_dict():
     max_memory = {}
     max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
diff --git a/modules/shared.py b/modules/shared.py
index 46b6ef478f..e2c12e3b88 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -106,7 +106,7 @@
 
 # bitsandbytes 4-bit
 group = parser.add_argument_group('bitsandbytes 4-bit')
-group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).')
+group.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes or ipex-llm).')
 group.add_argument('--use_double_quant', action='store_true', help='use_double_quant for 4-bit.')
 group.add_argument('--compute_dtype', type=str, default='float16', help='compute dtype for 4-bit. Valid options: bfloat16, float16, float32.')
 group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for 4-bit. Valid options: nf4, fp4.')
@@ -165,6 +165,10 @@
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
 
+# IPEX-LLM
+# --load-in-4bit is the same as bitsandbytes 4-bit
+# --trust-remote-code is the same as Transformers
+
 # DeepSpeed
 group = parser.add_argument_group('DeepSpeed')
 group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@@ -263,6 +267,8 @@ def fix_loader_name(name):
         return 'QuIP#'
     elif name in ['hqq']:
         return 'HQQ'
+    elif name in ['IPEX-LLM', 'ipex-llm']:
+        return 'IPEX-LLM'
 
 
 def add_extension(name, last=False):
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f99c605e13..e3d3ceded3 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -132,6 +132,8 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
         return input_ids
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
+    elif shared.args.loader == 'IPEX-LLM':
+        return input_ids
     elif torch.backends.mps.is_available():
         device = torch.device('mps')
         return input_ids.to(device)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 8d6122d252..52636de296 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -112,7 +112,7 @@ def create_ui():
 
                         with gr.Column():
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
-                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
+                            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit, info="Load the model with 4-bit precision.")
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                             shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
diff --git a/requirements.txt b/requirements.txt
index 17f7796644..acaf5c044c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -26,6 +26,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
index b6e2a944b4..5425f77da6 100644
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -24,6 +24,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
index 21c0f68832..361c9220fb 100644
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -24,6 +24,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index fec052c3fd..d26080223d 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -26,6 +26,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
index e85a19bd54..98439fc193 100644
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -24,6 +24,8 @@ tensorboard
 transformers==4.39.*
 tqdm
 wandb
+py-cpuinfo
+ipex-llm
 
 # API
 SpeechRecognition==3.10.0
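
For reference, a minimal standalone sketch of the loading path that the new `ipex_llm_loader` wraps, using only the keyword arguments shown in the patch above. The model directory name below is illustrative, not part of the change:

```python
# Hypothetical example: load a local HF-format model the same way the new loader does.
from ipex_llm.transformers import AutoModelForCausalLM  # ipex-llm drop-in for transformers
from transformers import AutoTokenizer

path_to_model = "models/my-llama-7b"  # illustrative path to a local model directory

model = AutoModelForCausalLM.from_pretrained(
    path_to_model,
    load_in_4bit=True,        # symmetric int4 quantization, exposed as --load-in-4bit in the UI
    optimize_model=True,      # ipex-llm model optimizations, hard-coded on in the loader above
    trust_remote_code=False,  # mirror of --trust-remote-code
    use_cache=True,
)
tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=False)

# Quick smoke test of the quantized model.
inputs = tokenizer("Hello, my name is", return_tensors="pt")
output_ids = model.generate(inputs["input_ids"], max_new_tokens=16)
print(tokenizer.decode(output_ids[0]))
```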