Enable quant model support #1074
Merged

Changes from all commits (65 commits)
- 3888824 enable IPEXModelForSeq2SeqLM (jiqing-feng)
- f9fa807 set static cache (jiqing-feng)
- 202df43 add tests for IPEXModelForSeq2SeqLM (jiqing-feng)
- 4488073 add docs (jiqing-feng)
- 16fecf8 fix readme (jiqing-feng)
- de501f4 Merge branch 'main' into text2text (jiqing-feng)
- 4225bf0 refactor compile (jiqing-feng)
- 2ac7ecf fix check (jiqing-feng)
- 24b988c fix ruff check (jiqing-feng)
- 5c4f9a1 Merge branch 'huggingface:main' into text2text (jiqing-feng)
- 46b93a4 enable quantized model (jiqing-feng)
- 82d39ce add bnb test (jiqing-feng)
- 7dc08da add bnb tests in yaml (jiqing-feng)
- 30027ff fix tests (jiqing-feng)
- 314db04 disable bnb tests (jiqing-feng)
- 87656ca fix gpt2 (jiqing-feng)
- 9a7e931 Merge branch 'main' into quant (jiqing-feng)
- b0cec9c set actual device (jiqing-feng)
- 94cf35d assign device when convert class (jiqing-feng)
- 9af46d1 fix class init (jiqing-feng)
- 18b2a6a fix ipex attn init (jiqing-feng)
- 9f6db33 rm set device on config (jiqing-feng)
- 6d8a969 fix format (jiqing-feng)
- dd811f9 fix mlp class init (jiqing-feng)
- d91eefb Merge branch 'huggingface:main' into quant (jiqing-feng)
- f094cad Merge branch 'main' into quant (jiqing-feng)
- dab4a78 add use_cache param when init generation config (jiqing-feng)
- 6bf3b8b fix gpt2 quant model (jiqing-feng)
- 356d51d fix falcon linear fusion (jiqing-feng)
- d1eee87 fix falcon (jiqing-feng)
- 3aece6a Merge branch 'huggingface:main' into quant (jiqing-feng)
- 57e3c27 enable awq model test (jiqing-feng)
- 8f6ba5c fix install (jiqing-feng)
- 8870714 fix install (jiqing-feng)
- 5828fc0 fix install (jiqing-feng)
- c616d57 fix install (jiqing-feng)
- e1715b8 fix install (jiqing-feng)
- e88faf2 fix install (jiqing-feng)
- 882f2b2 fix install (jiqing-feng)
- d8208c7 fix install (jiqing-feng)
- 80b9ccb fix install (jiqing-feng)
- f05fb2f fix install (jiqing-feng)
- bd8e870 fix install (jiqing-feng)
- e471be3 fix install (jiqing-feng)
- 96f4622 fix install (jiqing-feng)
- 32bf0a1 fix install (jiqing-feng)
- ad3467b fix install (jiqing-feng)
- 4a21d26 fix install (jiqing-feng)
- fb8002c fix install (jiqing-feng)
- 2ad2371 fix install (jiqing-feng)
- 3c2ddef enable bnb test (jiqing-feng)
- 757ea8c remove useless device (jiqing-feng)
- 0a6ab0f update python to 3.10 on test_ipex (jiqing-feng)
- 8c4884b Apply suggestions from code review (IlyasMoutawwakil)
- 7fa23a5 install autoawq (jiqing-feng)
- 5386bbe install wheel (jiqing-feng)
- c5f5d16 fix install autoawq (jiqing-feng)
- 41513f0 rm autoawq (jiqing-feng)
- 8e1caa2 rebase (jiqing-feng)
- f73c08d fix concat qkv (jiqing-feng)
- f51777b fix format (jiqing-feng)
- f64b251 fix qwen patch (jiqing-feng)
- 778bf15 fix bias (jiqing-feng)
- 6ba1895 rm autoawq test (jiqing-feng)
- 88dba29 fix style (jiqing-feng)
@@ -14,7 +14,7 @@
 from transformers.models.bert.modeling_bert import BertIntermediate
 from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconModel
-from transformers.models.gpt2.modeling_gpt2 import GPT2MLP, GPT2Attention, GPT2Block, GPT2Model
+from transformers.models.gpt2.modeling_gpt2 import GPT2Block, GPT2Model
 from transformers.models.llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaModel,

@@ -32,13 +32,11 @@
 from .modeling_utils import (
     _IPEX_MINIMUM_VERSION_FOR_PATCHING,
-    _IPEXGPT2MLP,
     _falcon_model_forward,
-    _gpt2_block_forward,
     _gpt2_model_forward,
     _ipex_rms_layer_norm_forward,
     _IPEXFalconDecoderLayer,
-    _IPEXGPT2Attention,
+    _IPEXGPT2Block,
     _IPEXIntermediate,
     _IPEXLlamaDecoderLayer,
     _IPEXQwen2DecoderLayer,

@@ -66,12 +64,12 @@ def convert_functions(m, target_m, new_function_name, new_function):
         convert_functions(sub_m, target_m, new_function_name, new_function)


-def convert_class(m, target_m, new_class, config=None):
+def convert_class(m, target_m, new_class, device, config):
     for name, sub_m in m.named_children():
         if isinstance(sub_m, target_m):
-            new_m = new_class(sub_m, config)
+            new_m = new_class(sub_m, device, config)
             setattr(m, name, new_m)
-        convert_class(sub_m, target_m, new_class, config)
+        convert_class(sub_m, target_m, new_class, device, config)


 def patch_op(m, target_m, new_op_name, new_op):
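For readers skimming the diff, here is a minimal, self-contained sketch of how the new calling convention is intended to be used. Only convert_class mirrors the diff above; the _DummyWrapper class and the toy module tree are hypothetical stand-ins for the IPEX wrapper classes in modeling_utils.

```python
import torch
from torch import nn


# Hypothetical wrapper following the new (module, device, config) constructor
# contract that the wrapped classes are now expected to implement.
class _DummyWrapper(nn.Module):
    def __init__(self, module, device, config):
        super().__init__()
        self.module = module
        self.device = device
        self.config = config

    def forward(self, x):
        return self.module(x)


# Same recursion as in the diff: every child matching target_m is swapped for
# new_class, and device/config are threaded down the whole module tree.
def convert_class(m, target_m, new_class, device, config):
    for name, sub_m in m.named_children():
        if isinstance(sub_m, target_m):
            new_m = new_class(sub_m, device, config)
            setattr(m, name, new_m)
        convert_class(sub_m, target_m, new_class, device, config)


toy = nn.Sequential(nn.Linear(4, 4), nn.Sequential(nn.Linear(4, 2)))
convert_class(toy, nn.Linear, _DummyWrapper, torch.device("cpu"), config=None)
print(toy)  # every nn.Linear is now wrapped in _DummyWrapper
```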
@@ -89,7 +87,7 @@ def _patch_llama_model(model):
     """
     convert_functions(model, LlamaModel, "forward", _llama_model_forward)
     convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward)
-    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
+    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.device, model.config)
     return model

@@ -105,21 +103,20 @@ def _patch_falcon_model(model):
     setattr(model.config, "num_key_value_heads", num_key_value_heads)
     convert_functions(model, FalconModel, "forward", _falcon_model_forward)
     replace_customized_linear_with_linear(model)
-    convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.config)
+    convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.device, model.config)
     return model


 def _patch_gpt2_model(model):
     """
     Patch gpt2 model:
         1. Use IPEX paged attention
         2. Linear fusion with (Linear + Add)
     """
     num_key_value_heads = model.config.num_attention_heads
     setattr(model.config, "num_key_value_heads", num_key_value_heads)
     convert_functions(model, GPT2Model, "forward", _gpt2_model_forward)
-    convert_functions(model, GPT2Block, "forward", _gpt2_block_forward)
-    convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.config)
-    convert_class(model, GPT2MLP, _IPEXGPT2MLP, model.config)
+    convert_class(model, GPT2Block, _IPEXGPT2Block, model.device, model.config)
Comment on lines -120 to +119:

Why are the MLP and attention no longer patched here?

Reply: Because they are now handled inside _IPEXGPT2Block.
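To make the reply concrete, here is a rough, hypothetical sketch of the pattern being described: the block-level wrapper re-wraps its own attention and MLP children in its constructor, so a single convert_class call on GPT2Block is enough. The _Example* classes below are illustrative stand-ins, not the actual _IPEXGPT2Block/_IPEXGPT2Attention/_IPEXGPT2MLP implementations from modeling_utils.

```python
from torch import nn


class _ExampleAttention(nn.Module):
    # Stand-in for _IPEXGPT2Attention: the real class would swap in IPEX
    # paged attention; here we only delegate to the original module.
    def __init__(self, module, device, config):
        super().__init__()
        self.module, self.device, self.config = module, device, config

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)


class _ExampleMLP(nn.Module):
    # Stand-in for _IPEXGPT2MLP: the real class would apply linear fusion.
    def __init__(self, module, device, config):
        super().__init__()
        self.module, self.device, self.config = module, device, config

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)


class _ExampleGPT2Block(nn.Module):
    # Stand-in for _IPEXGPT2Block: patching the block is sufficient because
    # the block wraps its own attn/mlp children, which is why the separate
    # GPT2Attention/GPT2MLP conversions could be dropped from the patcher.
    def __init__(self, module, device, config):
        super().__init__()
        module.attn = _ExampleAttention(module.attn, device, config)
        module.mlp = _ExampleMLP(module.mlp, device, config)
        self.block = module

    def forward(self, *args, **kwargs):
        return self.block(*args, **kwargs)
```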
     return model

@@ -131,7 +128,7 @@ def _patch_qwen2_model(model):
     """
     convert_functions(model, Qwen2Model, "forward", _qwen2_model_forward)
     convert_functions(model, Qwen2RMSNorm, "forward", _ipex_rms_layer_norm_forward)
-    convert_class(model, Qwen2DecoderLayer, _IPEXQwen2DecoderLayer, model.config)
+    convert_class(model, Qwen2DecoderLayer, _IPEXQwen2DecoderLayer, model.device, model.config)
     return model

@@ -140,7 +137,7 @@ def _patch_bert_model(model):
     Patch bert model:
         1. Linear fusion with Linear + Gelu
     """
-    convert_class(model, BertIntermediate, _IPEXIntermediate)
+    convert_class(model, BertIntermediate, _IPEXIntermediate, model.device, model.config)
     return model

@@ -149,7 +146,7 @@ def _patch_vit_model(model):
     Patch vit model:
         1. Linear fusion with Linear + Gelu
     """
-    convert_class(model, ViTIntermediate, _IPEXIntermediate)
+    convert_class(model, ViTIntermediate, _IPEXIntermediate, model.device, model.config)
     return model
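For the user-facing side that the PR title refers to, the intended flow is presumably along these lines. This is a hedged sketch based on the bnb test commits above, not code from the PR: the checkpoint id is illustrative, and it assumes from_pretrained forwards quantization_config to transformers the way the tests do.

```python
from transformers import AutoTokenizer, BitsAndBytesConfig
from optimum.intel import IPEXModelForCausalLM

model_id = "gpt2"  # illustrative checkpoint, not taken from the PR
# bitsandbytes quantization config, as exercised by the new bnb tests
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Assumption: with this PR, quantized checkpoints load through the IPEX model
# classes the same way they do through plain transformers.
model = IPEXModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Quantized generation with IPEX:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```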
no luck with autoawq?
Yes, the autoawq installation has some issues; I will figure out why this environment cannot install autoawq. The tests passed in my local environment, which has autoawq installed.