From cc2f77c4977c37ed8a86fc76f7a30a036a005449 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Dec 2024 13:52:18 +0100 Subject: [PATCH 01/30] protect token --- optimum_benchmark/backends/config.py | 33 ++++++++++++-------- optimum_benchmark/task_utils.py | 45 +++++++++++++++++++++------- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index fc265d4d..c47b7366 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -22,13 +22,13 @@ class BackendConfig(ABC): version: str _target_: str + model: Optional[str] = None + processor: Optional[str] = None + task: Optional[str] = None library: Optional[str] = None model_type: Optional[str] = None - model: Optional[str] = None - processor: Optional[str] = None - device: Optional[str] = None # we use a string here instead of a list # because it's easier to pass in a yaml or from cli @@ -48,30 +48,44 @@ def __post_init__(self): if self.model is None: raise ValueError("`model` must be specified.") + if self.model_kwargs.get("token", None) is not None: + LOGGER.info( + "You have passed an argument `token` to `model_kwargs`. This is dangerous as the config cannot do encryption to protect it. " + "We will proceed to registering `token` in the environment as `HF_TOKEN` to avoid saving it or pushing it to the hub by mistake." + ) + os.environ["HF_TOKEN"] = self.model_kwargs.pop("token") + if self.processor is None: self.processor = self.model - # TODO: add cache_dir, token, etc. to these methods + if not self.processor_kwargs: + self.processor_kwargs = self.model_kwargs + if self.library is None: self.library = infer_library_from_model_name_or_path( model_name_or_path=self.model, - token=self.model_kwargs.get("token", None), revision=self.model_kwargs.get("revision", None), + cache_dir=self.model_kwargs.get("cache_dir", None), + ) + + if self.library not in ["transformers", "diffusers", "timm", "llama_cpp"]: + raise ValueError( + f"`library` must be either `transformers`, `diffusers`, `timm` or `llama_cpp`, but got {self.library}" ) if self.task is None: self.task = infer_task_from_model_name_or_path( model_name_or_path=self.model, - token=self.model_kwargs.get("token", None), revision=self.model_kwargs.get("revision", None), + cache_dir=self.model_kwargs.get("cache_dir", None), library_name=self.library, ) if self.model_type is None: self.model_type = infer_model_type_from_model_name_or_path( model_name_or_path=self.model, - token=self.model_kwargs.get("token", None), revision=self.model_kwargs.get("revision", None), + cache_dir=self.model_kwargs.get("cache_dir", None), library_name=self.library, ) @@ -103,11 +117,6 @@ def __post_init__(self): else: raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.") - if self.library not in ["transformers", "diffusers", "timm", "llama_cpp"]: - raise ValueError( - f"`library` must be either `transformers`, `diffusers`, `timm` or `llama_cpp`, but got {self.library}" - ) - if self.inter_op_num_threads is not None: if self.inter_op_num_threads == -1: self.inter_op_num_threads = cpu_count() diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index 7c066d14..45e3a342 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -155,7 +155,11 @@ def is_local_dir_repo(model_name_or_path: str) -> bool: def get_repo_config( - model_name_or_path: str, config_name: str, token: Optional[str] = None, 
revision: Optional[str] = None + model_name_or_path: str, + config_name: str, + token: Optional[str] = None, + revision: Optional[str] = None, + cache_dir: Optional[str] = None, ): if is_hf_hub_repo(model_name_or_path, token=token): config = json.load( @@ -163,6 +167,7 @@ def get_repo_config( huggingface_hub.hf_hub_download( repo_id=model_name_or_path, filename=config_name, + cache_dir=cache_dir, revision=revision, token=token, ), @@ -197,6 +202,7 @@ def infer_library_from_model_name_or_path( model_name_or_path: str, token: Optional[str] = None, revision: Optional[str] = None, + cache_dir: Optional[str] = None, ) -> str: inferred_library_name = None @@ -209,7 +215,9 @@ def infer_library_from_model_name_or_path( inferred_library_name = "sentence-transformers" elif "config.json" in repo_files: - config_dict = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + config_dict = get_repo_config( + model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) if "pretrained_cfg" in config_dict: inferred_library_name = "timm" @@ -229,12 +237,15 @@ def infer_task_from_model_name_or_path( model_name_or_path: str, token: Optional[str] = None, revision: Optional[str] = None, + cache_dir: Optional[str] = None, library_name: Optional[str] = None, ) -> str: inferred_task_name = None if library_name is None: - library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision, token=token) + library_name = infer_library_from_model_name_or_path( + model_name_or_path, revision=revision, token=token, cache_dir=cache_dir + ) if library_name == "llama_cpp": inferred_task_name = "text-generation" @@ -243,7 +254,9 @@ def infer_task_from_model_name_or_path( inferred_task_name = "image-classification" elif library_name == "transformers": - transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + transformers_config = get_repo_config( + model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) target_class_name = transformers_config["architectures"][0] for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items(): @@ -258,7 +271,9 @@ def infer_task_from_model_name_or_path( raise KeyError(f"Could not find the proper task name for target class name {target_class_name}.") elif library_name == "diffusers": - diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision) + diffusers_config = get_repo_config( + model_name_or_path, "model_index.json", token=token, revision=revision, cache_dir=cache_dir + ) target_class_name = diffusers_config["_class_name"] for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items(): @@ -279,26 +294,35 @@ def infer_model_type_from_model_name_or_path( model_name_or_path: str, token: Optional[str] = None, revision: Optional[str] = None, + cache_dir: Optional[str] = None, library_name: Optional[str] = None, ) -> str: inferred_model_type = None if library_name is None: - library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision, token=token) + library_name = infer_library_from_model_name_or_path( + model_name_or_path, revision=revision, token=token, cache_dir=cache_dir + ) if library_name == "llama_cpp": inferred_model_type = "llama_cpp" elif library_name == "timm": - timm_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + timm_config = get_repo_config( + 
model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) inferred_model_type = timm_config["architecture"] elif library_name == "transformers": - transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + transformers_config = get_repo_config( + model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) inferred_model_type = transformers_config["model_type"] elif library_name == "diffusers": - diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision) + diffusers_config = get_repo_config( + model_name_or_path, "model_index.json", token=token, revision=revision, cache_dir=cache_dir + ) target_class_name = diffusers_config["_class_name"] for _, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items(): @@ -310,6 +334,7 @@ def infer_model_type_from_model_name_or_path( break if inferred_model_type is None: - raise KeyError(f"Could not find the proper model type for target class name {target_class_name}.") + # we use the class name in this case + inferred_model_type = target_class_name.replace("DiffusionPipeline", "").replace("Pipeline", "") return inferred_model_type From 0b2f878ea24a4c31922e687ab0d93fab7acfaee5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Dec 2024 14:35:41 +0100 Subject: [PATCH 02/30] no_weights for TXI --- optimum_benchmark/backends/py_txi/backend.py | 4 +--- optimum_benchmark/backends/py_txi/config.py | 15 ++++++--------- tests/configs/cpu_inference_py_txi_gpt2.yaml | 1 + tests/configs/cpu_inference_py_txi_st_bert.yaml | 1 + tests/configs/cuda_inference_py_txi_gpt2.yaml | 1 + tests/configs/cuda_inference_py_txi_st_bert.yaml | 1 + 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 6e637a31..1b02277a 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -53,7 +53,6 @@ def download_pretrained_model(self) -> None: def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") model_cache_path = f"{self.volume}/{model_cache_folder}" snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" @@ -95,8 +94,7 @@ def create_no_weights_model(self) -> None: def load_model_with_no_weights(self) -> None: original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/data/no_weights_model" - self.logger.info("\t+ Loading no weights model") + original_model, self.config.model = self.config.model, "/data/no_weights_model/" self.load_model_from_pretrained() self.config.model, self.config.volumes = original_model, original_volumes diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index dae410c4..2bf6c04c 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -22,7 +22,7 @@ class PyTXIConfig(BackendConfig): # Image to use for the container image: Optional[str] = None # Shared memory size for the container - shm_size: str = "1g" + shm_size: Optional[str] = None # List of custom devices to forward to the container e.g. 
["/dev/kfd", "/dev/dri"] for ROCm devices: Optional[List[str]] = None # NVIDIA-docker GPU device options e.g. "all" (all) or "0,1,2,3" (ids) or 4 (count) @@ -41,9 +41,13 @@ class PyTXIConfig(BackendConfig): metadata={"help": "List of environment variables to forward to the container from the host."}, ) + # first connection/request + connection_timeout: int = 60 + first_request_timeout: int = 60 + max_concurrent_requests: Optional[int] = None + # Common options dtype: Optional[str] = None - max_concurrent_requests: Optional[int] = None # TGI specific sharded: Optional[str] = None @@ -72,13 +76,6 @@ def __post_init__(self): renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in ids] - # Common options - if self.max_concurrent_requests is None: - if self.task in TEXT_GENERATION_TASKS: - self.max_concurrent_requests = 128 - elif self.task in TEXT_EMBEDDING_TASKS: - self.max_concurrent_requests = 512 - # TGI specific if self.task in TEXT_GENERATION_TASKS: if self.trust_remote_code is None: diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 76e90775..1aef598e 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _gpt2_ # inherits from gpt2 config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/configs/cpu_inference_py_txi_st_bert.yaml b/tests/configs/cpu_inference_py_txi_st_bert.yaml index 2650e1bf..99e571b5 100644 --- a/tests/configs/cpu_inference_py_txi_st_bert.yaml +++ b/tests/configs/cpu_inference_py_txi_st_bert.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _st_bert_ # inherits from bert config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml index 73a5c10a..1c93ac36 100644 --- a/tests/configs/cuda_inference_py_txi_gpt2.yaml +++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _gpt2_ # inherits from gpt2 config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/configs/cuda_inference_py_txi_st_bert.yaml b/tests/configs/cuda_inference_py_txi_st_bert.yaml index 8ae494e7..5bb38528 100644 --- a/tests/configs/cuda_inference_py_txi_st_bert.yaml +++ b/tests/configs/cuda_inference_py_txi_st_bert.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _st_bert_ # inherits from bert config - _self_ # hydra 1.1 compatibility - override backend: py-txi From 6fb84c6eb4dd3f90e67ee56a27a8a6315cf925ea Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Dec 2024 14:44:09 +0100 Subject: [PATCH 03/30] fix --- optimum_benchmark/backends/py_txi/config.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index 2bf6c04c..9cc23a64 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -76,7 +76,9 @@ def __post_init__(self): renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in ids] - # TGI specific - if self.task in TEXT_GENERATION_TASKS: - if self.trust_remote_code is None: - self.trust_remote_code = self.model_kwargs.get("trust_remote_code", False) + # Common options + if self.max_concurrent_requests is None: + if self.task in TEXT_GENERATION_TASKS: + self.max_concurrent_requests = 128 + elif self.task in TEXT_EMBEDDING_TASKS: + self.max_concurrent_requests = 512 From e936705e57c447f17a1bb7a37449cd18c449250b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sun, 15 Dec 2024 18:02:05 +0100 Subject: [PATCH 04/30] test --- optimum_benchmark/backends/py_txi/backend.py | 6 +++--- optimum_benchmark/backends/py_txi/config.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 1b02277a..184f2518 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -93,10 +93,10 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/data/no_weights_model/" + self.config.volumes[self.tmpdir.name] = {"bind": "/no_weights_data/", "mode": "rw"} + original_model, self.config.model = self.config.model, "/no_weights_data/no_weights_model/" self.load_model_from_pretrained() - self.config.model, self.config.volumes = original_model, original_volumes + self.config.model = original_model def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index 9cc23a64..bde8ab43 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -37,7 +37,7 @@ class PyTXIConfig(BackendConfig): metadata={"help": "Dictionary of volumes to mount inside the container."}, ) environment: List[str] = field( - default_factory=lambda: ["HUGGING_FACE_HUB_TOKEN"], + default_factory=lambda: ["HF_TOKEN"], metadata={"help": "List of environment variables to forward to the container from the host."}, ) From 4289798baed85ab17e76946c5d7c7b93f9a7916a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sun, 15 Dec 2024 18:11:02 +0100 Subject: [PATCH 05/30] force txi sequential in cuda ci --- .github/workflows/test_cli_cuda_py_txi.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml index 5c090b28..b8c50db0 100644 --- a/.github/workflows/test_cli_cuda_py_txi.yaml +++ b/.github/workflows/test_cli_cuda_py_txi.yaml @@ -48,7 +48,8 @@ jobs: pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git - name: Run tests - run: pytest tests/test_cli.py -x -s -k "cli and cuda and py_txi" + run: | + FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli 
and cuda and py_txi" - if: ${{ (github.event_name == 'push') || @@ -56,4 +57,5 @@ jobs: contains( github.event.pull_request.labels.*.name, 'examples') }} name: Run examples - run: pytest tests/test_examples.py -x -s -k "cli and cuda and (tgi or tei)" + run: | + FORCE_SEQUENTIAL=1 pytest tests/test_examples.py -x -s -k "cli and cuda and (tgi or tei)" From ecaa6c87ad151ea16df184e8046d77ba5bb5f2f8 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sun, 15 Dec 2024 18:46:29 +0100 Subject: [PATCH 06/30] test --- optimum_benchmark/backends/py_txi/backend.py | 27 +++++++------------ tests/configs/cpu_inference_py_txi_gpt2.yaml | 3 +++ tests/configs/cuda_inference_py_txi_gpt2.yaml | 3 +++ 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 184f2518..c56a4989 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -63,40 +63,33 @@ def prepare_generation_config(self) -> None: def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") safetensor = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=state_dict, filename=safetensor, metadata={"format": "pt"}) - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - self.logger.info("\t+ Saving no weights model pretrained processor") - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model - self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") with fast_weights_init(): + # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - self.logger.info("\t+ Saving no weights model") - self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) + save_file(tensors=self.pretrained_model.state_dict(), filename=safetensor, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() - if self.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Modifying generation config for fixed length generation") + if self.pretrained_config is not None: + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + if self.pretrained_processor is not None: + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + if self.generation_config is not None: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes[self.tmpdir.name] = {"bind": "/no_weights_data/", "mode": "rw"} - original_model, self.config.model = self.config.model, "/no_weights_data/no_weights_model/" + self.config.volumes = (self.config.volumes, 
{self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}}) + original_model, self.config.model = self.config.model, self.no_weights_model self.load_model_from_pretrained() - self.config.model = original_model + self.config.model, self.config.volumes = original_model def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 1aef598e..82a522bd 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -9,3 +9,6 @@ defaults: - override backend: py-txi name: cpu_inference_py_txi_gpt2 + +backend: + cuda_graphs: 0 diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml index 1c93ac36..d0d17dbc 100644 --- a/tests/configs/cuda_inference_py_txi_gpt2.yaml +++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml @@ -9,3 +9,6 @@ defaults: - override backend: py-txi name: cuda_inference_py_txi_gpt2 + +backend: + cuda_graphs: 0 From ea328024a95ad2fdc3143b39735442d300d5fb6e Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:34:46 +0100 Subject: [PATCH 07/30] test --- optimum_benchmark/backends/py_txi/backend.py | 150 ++++++++++++------- optimum_benchmark/backends/py_txi/config.py | 41 ++--- 2 files changed, 110 insertions(+), 81 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index c56a4989..5afe1e1e 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -31,11 +31,6 @@ def load(self) -> None: else: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() - - if self.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Preparing generation config") - self.prepare_generation_config() - self.logger.info("\t+ Loading pretrained model") self.load_model_from_pretrained() @@ -50,29 +45,30 @@ def download_pretrained_model(self) -> None: with init_empty_weights(include_buffers=True): self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) - def prepare_generation_config(self) -> None: - self.generation_config.eos_token_id = None - self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = f"{self.volume}/{model_cache_folder}" - snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" - snapshot_ref = open(snapshot_file, "r").read().strip() - model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving new pretrained generation config") - self.generation_config.save_pretrained(save_directory=model_snapshot_path) + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Preparing generation config") + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + model_cache_folder = f"models/{self.config.model}".replace("/", "--") + model_cache_path = f"{self.volume}/{model_cache_folder}" + snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" + snapshot_ref = open(snapshot_file, "r").read().strip() + model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" + self.logger.info("\t+ Saving pretrained generation config") + self.generation_config.save_pretrained(save_directory=model_snapshot_path) 
def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + filename = os.path.join(self.no_weights_model, "model.safetensors") os.makedirs(self.no_weights_model, exist_ok=True) - state_dict = torch.nn.Linear(1, 1).state_dict() - safetensor = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensor, metadata={"format": "pt"}) + + save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - save_file(tensors=self.pretrained_model.state_dict(), filename=safetensor, metadata={"format": "pt"}) + save_file(tensors=self.pretrained_model.state_dict(), filename=filename, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() @@ -80,56 +76,108 @@ def create_no_weights_model(self) -> None: self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) if self.pretrained_processor is not None: self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - if self.generation_config is not None: + + if self.config.task in TEXT_GENERATION_TASKS: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = (self.config.volumes, {self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}}) + self.config.volumes = {self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}} original_model, self.config.model = self.config.model, self.no_weights_model self.load_model_from_pretrained() - self.config.model, self.config.volumes = original_model + self.config.model = original_model def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: self.pretrained_model = TGI( - config=TGIConfig( - model_id=self.config.model, - gpus=self.config.gpus, - devices=self.config.devices, - volumes=self.config.volumes, - environment=self.config.environment, - ports=self.config.ports, - dtype=self.config.dtype, - sharded=self.config.sharded, - quantize=self.config.quantize, - num_shard=self.config.num_shard, - speculate=self.config.speculate, - cuda_graphs=self.config.cuda_graphs, - disable_custom_kernels=self.config.disable_custom_kernels, - trust_remote_code=self.config.trust_remote_code, - max_concurrent_requests=self.config.max_concurrent_requests, - ), + config=TGIConfig(self.config.model, **self.txi_kwargs, **self.tgi_kwargs), ) - elif self.config.task in TEXT_EMBEDDING_TASKS: self.pretrained_model = TEI( - config=TEIConfig( - model_id=self.config.model, - gpus=self.config.gpus, - devices=self.config.devices, - volumes=self.config.volumes, - environment=self.config.environment, - ports=self.config.ports, - dtype=self.config.dtype, - pooling=self.config.pooling, - max_concurrent_requests=self.config.max_concurrent_requests, - ), + config=TEIConfig(self.config.model, **self.txi_kwargs, **self.tei_kwargs), ) else: raise NotImplementedError(f"TXI does not support task {self.config.task}") + @property + def txi_kwargs(self): + kwargs = {} + + if self.config.gpus is not None: + kwargs["gpus"] = self.config.gpus + + if self.config.image is not None: + 
kwargs["image"] = self.config.image + + if self.config.ports is not None: + kwargs["ports"] = self.config.ports + + if self.config.volumes is not None: + kwargs["volumes"] = self.config.volumes + + if self.config.devices is not None: + kwargs["devices"] = self.config.devices + + if self.config.shm_size is not None: + kwargs["shm_size"] = self.config.shm_size + + if self.config.environment is not None: + kwargs["environment"] = self.config.environment + + if self.config.connection_timeout is not None: + kwargs["connection_timeout"] = self.config.connection_timeout + + if self.config.first_request_timeout is not None: + kwargs["first_request_timeout"] = self.config.first_request_timeout + + if self.config.max_concurrent_requests is not None: + kwargs["max_concurrent_requests"] = self.config.max_concurrent_requests + + return kwargs + + @property + def tei_kwargs(self): + kwargs = {} + + if self.config.dtype is not None: + kwargs["dtype"] = self.config.dtype + + if self.config.pooling is not None: + kwargs["pooling"] = self.config.pooling + + return kwargs + + @property + def tgi_kwargs(self): + kwargs = {} + + if self.config.dtype is not None: + kwargs["dtype"] = self.config.dtype + + if self.config.sharded is not None: + kwargs["sharded"] = self.config.sharded + + if self.config.quantize is not None: + kwargs["quantize"] = self.config.quantize + + if self.config.num_shard is not None: + kwargs["num_shard"] = self.config.num_shard + + if self.config.speculate is not None: + kwargs["speculate"] = self.config.speculate + + if self.config.cuda_graphs is not None: + kwargs["cuda_graphs"] = self.config.cuda_graphs + + if self.config.trust_remote_code is not None: + kwargs["trust_remote_code"] = self.config.trust_remote_code + + if self.config.disable_custom_kernels is not None: + kwargs["disable_custom_kernels"] = self.config.disable_custom_kernels + + return kwargs + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index bde8ab43..3b4a908d 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -1,9 +1,7 @@ import os -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE - from ...import_utils import py_txi_version from ...system_utils import is_nvidia_system, is_rocm_system from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS @@ -16,7 +14,7 @@ class PyTXIConfig(BackendConfig): version: Optional[str] = py_txi_version() _target_: str = "optimum_benchmark.backends.py_txi.backend.PyTXIBackend" - # optimum benchmark specific + # optimum-benchmark specific no_weights: bool = False # Image to use for the container @@ -28,27 +26,18 @@ class PyTXIConfig(BackendConfig): # NVIDIA-docker GPU device options e.g. 
"all" (all) or "0,1,2,3" (ids) or 4 (count) gpus: Optional[Union[str, int]] = None # Things to forward to the container - ports: Dict[str, Any] = field( - default_factory=lambda: {"80/tcp": ("127.0.0.1", 0)}, - metadata={"help": "Dictionary of ports to expose from the container."}, - ) - volumes: Dict[str, Any] = field( - default_factory=lambda: {HUGGINGFACE_HUB_CACHE: {"bind": "/data", "mode": "rw"}}, - metadata={"help": "Dictionary of volumes to mount inside the container."}, - ) - environment: List[str] = field( - default_factory=lambda: ["HF_TOKEN"], - metadata={"help": "List of environment variables to forward to the container from the host."}, - ) - - # first connection/request - connection_timeout: int = 60 - first_request_timeout: int = 60 + ports: Optional[Dict[str, Any]] = None + environment: Optional[List[str]] = None + volumes: Optional[Dict[str, Any]] = None + # First connection/request + connection_timeout: Optional[int] = None + first_request_timeout: Optional[int] = None max_concurrent_requests: Optional[int] = None # Common options dtype: Optional[str] = None - + # TEI specific + pooling: Optional[str] = None # TGI specific sharded: Optional[str] = None quantize: Optional[str] = None @@ -58,9 +47,6 @@ class PyTXIConfig(BackendConfig): trust_remote_code: Optional[bool] = None disable_custom_kernels: Optional[bool] = None - # TEI specific - pooling: Optional[str] = None - def __post_init__(self): super().__post_init__() @@ -76,9 +62,4 @@ def __post_init__(self): renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in ids] - # Common options - if self.max_concurrent_requests is None: - if self.task in TEXT_GENERATION_TASKS: - self.max_concurrent_requests = 128 - elif self.task in TEXT_EMBEDDING_TASKS: - self.max_concurrent_requests = 512 + self.trust_remote_code = self.model_kwargs.get("trust_remote_code", None) From abec2b6a37ff0dc95ad2486ddf174e05c00e741e Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:50:17 +0100 Subject: [PATCH 08/30] test --- optimum_benchmark/backends/py_txi/backend.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 5afe1e1e..81f1f6c3 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List import torch -from accelerate import init_empty_weights +from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig from safetensors.torch import save_file @@ -36,26 +36,13 @@ def load(self) -> None: self.tmpdir.cleanup() - @property - def volume(self) -> str: - return list(self.config.volumes.keys())[0] - def download_pretrained_model(self) -> None: - # directly downloads pretrained model in volume (/data) to change generation config before loading model - with init_empty_weights(include_buffers=True): - self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) + model_snapshot_folder = snapshot_download(self.config.model, self.config.model_kwargs) if self.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Preparing generation config") self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = 
f"{self.volume}/{model_cache_folder}" - snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" - snapshot_ref = open(snapshot_file, "r").read().strip() - model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving pretrained generation config") - self.generation_config.save_pretrained(save_directory=model_snapshot_path) + self.generation_config.save_pretrained(save_directory=model_snapshot_folder) def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") From 57680a3aae06595dd0b67ee315aed55b58080f36 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:51:00 +0100 Subject: [PATCH 09/30] test no weights only --- tests/configs/_no_weights_.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/configs/_no_weights_.yaml b/tests/configs/_no_weights_.yaml index 31bbf2eb..bb0afa43 100644 --- a/tests/configs/_no_weights_.yaml +++ b/tests/configs/_no_weights_.yaml @@ -2,4 +2,4 @@ hydra: mode: MULTIRUN sweeper: params: - backend.no_weights: true,false + backend.no_weights: true From 31f96ba7d9d99dbe36a2dbe0f780ce607ae7fd3a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:56:40 +0100 Subject: [PATCH 10/30] fix --- optimum_benchmark/backends/py_txi/backend.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 81f1f6c3..e4c00f27 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -46,9 +46,14 @@ def download_pretrained_model(self) -> None: def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - filename = os.path.join(self.no_weights_model, "model.safetensors") os.makedirs(self.no_weights_model, exist_ok=True) + if self.pretrained_config is not None: + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + if self.pretrained_processor is not None: + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + + filename = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model @@ -59,11 +64,6 @@ def create_no_weights_model(self) -> None: del self.pretrained_model torch.cuda.empty_cache() - if self.pretrained_config is not None: - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - if self.pretrained_processor is not None: - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - if self.config.task in TEXT_GENERATION_TASKS: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None @@ -78,11 +78,11 @@ def load_model_with_no_weights(self) -> None: def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: self.pretrained_model = TGI( - config=TGIConfig(self.config.model, **self.txi_kwargs, **self.tgi_kwargs), + config=TGIConfig(model_id=self.config.model, **self.txi_kwargs, **self.tgi_kwargs), ) elif self.config.task in TEXT_EMBEDDING_TASKS: self.pretrained_model = TEI( - config=TEIConfig(self.config.model, **self.txi_kwargs, **self.tei_kwargs), + 
config=TEIConfig(model_id=self.config.model, **self.txi_kwargs, **self.tei_kwargs), ) else: raise NotImplementedError(f"TXI does not support task {self.config.task}") From 8c8d073670a817f46b20590fd0ec24b63cf1fbd8 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 12:02:57 +0100 Subject: [PATCH 11/30] test --- optimum_benchmark/backends/py_txi/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index e4c00f27..ba542a2f 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -5,7 +5,7 @@ import torch from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file +from safetensors.torch import save_file, save_model from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -60,7 +60,7 @@ def create_no_weights_model(self) -> None: self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - save_file(tensors=self.pretrained_model.state_dict(), filename=filename, metadata={"format": "pt"}) + self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) del self.pretrained_model torch.cuda.empty_cache() From bc1c638f2251689a88a1d3f6077959db73f5ebd5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 12:32:35 +0100 Subject: [PATCH 12/30] test --- optimum_benchmark/backends/py_txi/backend.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index ba542a2f..96a80b6c 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -48,13 +48,10 @@ def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") os.makedirs(self.no_weights_model, exist_ok=True) - if self.pretrained_config is not None: - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - if self.pretrained_processor is not None: - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - filename = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( @@ -70,8 +67,8 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = {self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}} - original_model, self.config.model = self.config.model, self.no_weights_model + self.config.volumes = {self.tmpdir.name: {"bind": "/var/no_weights_folder", "mode": "rw"}} + original_model, self.config.model = self.config.model, "/var/no_weights_folder/no_weights_model" self.load_model_from_pretrained() self.config.model = original_model From 95d45e590f39bdf14031743e98770302d01ed570 Mon 
Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 12:52:04 +0100 Subject: [PATCH 13/30] test again --- optimum_benchmark/backends/py_txi/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 96a80b6c..a3341bcf 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -67,8 +67,8 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = {self.tmpdir.name: {"bind": "/var/no_weights_folder", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/var/no_weights_folder/no_weights_model" + self.config.volumes = {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} + original_model, self.config.model = self.config.model, "/data/no_weights_model/" self.load_model_from_pretrained() self.config.model = original_model From 63ef40757baee4c5c82b48a6465fc1b76e814a92 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 13:13:17 +0100 Subject: [PATCH 14/30] test --- optimum_benchmark/backends/py_txi/backend.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index a3341bcf..a4c085eb 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -4,8 +4,9 @@ import torch from huggingface_hub import snapshot_download +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file, save_model +from safetensors.torch import save_file from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -67,8 +68,11 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/data/no_weights_model/" + self.config.volumes = { + HUGGINGFACE_HUB_CACHE: {"bind": "/data", "mode": "rw"}, + self.tmpdir.name: {"bind": "/no_weights_folder", "mode": "rw"}, + } + original_model, self.config.model = self.config.model, "/no_weights_folder/no_weights_model/" self.load_model_from_pretrained() self.config.model = original_model From 1f9fdd661f2dc9aa2ab258c35151fa13d6c992eb Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 14:04:12 +0100 Subject: [PATCH 15/30] test --- optimum_benchmark/backends/py_txi/backend.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index a4c085eb..7ee11b7e 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -38,7 +38,7 @@ def load(self) -> None: self.tmpdir.cleanup() def download_pretrained_model(self) -> None: - model_snapshot_folder = snapshot_download(self.config.model, self.config.model_kwargs) + model_snapshot_folder = snapshot_download(self.config.model, **self.config.model_kwargs) if self.config.task in TEXT_GENERATION_TASKS: self.generation_config.eos_token_id = None @@ -68,11 +68,8 @@ def 
create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = { - HUGGINGFACE_HUB_CACHE: {"bind": "/data", "mode": "rw"}, - self.tmpdir.name: {"bind": "/no_weights_folder", "mode": "rw"}, - } - original_model, self.config.model = self.config.model, "/no_weights_folder/no_weights_model/" + self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}} + original_model, self.config.model = self.config.model, "/no_weights_model/" self.load_model_from_pretrained() self.config.model = original_model From 2c8808023f6cd52922e120c1bf9abf5820c00460 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 14:13:19 +0100 Subject: [PATCH 16/30] disable safe ser --- optimum_benchmark/backends/py_txi/backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 7ee11b7e..83dcd6cb 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -4,7 +4,6 @@ import torch from huggingface_hub import snapshot_download -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from py_txi import TEI, TGI, TEIConfig, TGIConfig from safetensors.torch import save_file @@ -58,7 +57,7 @@ def create_no_weights_model(self) -> None: self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_model.save_pretrained(save_directory=self.no_weights_model, safe_serialization=False) del self.pretrained_model torch.cuda.empty_cache() From 4dd46d1228ca3b66a7c99af4f1df9194c0143d8e Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 14:31:54 +0100 Subject: [PATCH 17/30] test --- optimum_benchmark/backends/py_txi/backend.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 83dcd6cb..12d9d356 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -5,7 +5,7 @@ import torch from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file +from safetensors.torch import save_file, save_model from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -48,24 +48,24 @@ def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") os.makedirs(self.no_weights_model, exist_ok=True) + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + if self.config.task in TEXT_GENERATION_TASKS: + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + self.generation_config.save_pretrained(save_directory=self.no_weights_model) + filename = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - 
self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - self.pretrained_model.save_pretrained(save_directory=self.no_weights_model, safe_serialization=False) + save_model(tensors=self.pretrained_model, filename=filename, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() - if self.config.task in TEXT_GENERATION_TASKS: - self.generation_config.eos_token_id = None - self.generation_config.pad_token_id = None - self.generation_config.save_pretrained(save_directory=self.no_weights_model) - def load_model_with_no_weights(self) -> None: self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}} original_model, self.config.model = self.config.model, "/no_weights_model/" From 75945ac87e466cbe80bdc8e76346cc83d1aba596 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 15:10:59 +0100 Subject: [PATCH 18/30] fix --- optimum_benchmark/backends/py_txi/backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 12d9d356..605abdf2 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -5,7 +5,7 @@ import torch from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file, save_model +from safetensors.torch import save_model from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -56,13 +56,13 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) filename = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) + save_model(tensors=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - save_model(tensors=self.pretrained_model, filename=filename, metadata={"format": "pt"}) + save_model(model=self.pretrained_model, filename=filename, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() From bd542899cedcea9facbc3974b7bf29280cede6c3 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 15:13:06 +0100 Subject: [PATCH 19/30] faster req installation --- .github/workflows/test_cli_cpu_py_txi.yaml | 9 ++++++--- .github/workflows/test_cli_cuda_py_txi.yaml | 8 ++++++-- optimum_benchmark/backends/py_txi/backend.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test_cli_cpu_py_txi.yaml b/.github/workflows/test_cli_cpu_py_txi.yaml index 7b1946e7..06bd841d 100644 --- a/.github/workflows/test_cli_cpu_py_txi.yaml +++ b/.github/workflows/test_cli_cpu_py_txi.yaml @@ -43,9 +43,12 @@ jobs: - name: Install requirements run: | - pip install --upgrade pip - pip install torch torchvision torchaudio --index-url 
https://download.pytorch.org/whl/cpu - pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + pip install uv + uv pip install --upgrade pip + uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + uv pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + env: + UV_SYSTEM_PYTHON: 1 - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and py_txi" diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml index b8c50db0..a7fe9a51 100644 --- a/.github/workflows/test_cli_cuda_py_txi.yaml +++ b/.github/workflows/test_cli_cuda_py_txi.yaml @@ -44,8 +44,12 @@ jobs: - name: Install requirements run: | - pip install --upgrade pip - pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + pip install uv + uv pip install --upgrade pip + uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + uv pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + env: + UV_SYSTEM_PYTHON: 1 - name: Run tests run: | diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 605abdf2..0f52bc50 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -56,7 +56,7 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) filename = os.path.join(self.no_weights_model, "model.safetensors") - save_model(tensors=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) + save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( From 3b0138b30cb884475724932f27ab922b751dbee1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 15:23:29 +0100 Subject: [PATCH 20/30] use older tgi version --- examples/cuda_tgi_llama.yaml | 1 + tests/configs/_no_weights_.yaml | 2 +- tests/configs/cpu_inference_py_txi_gpt2.yaml | 1 + tests/configs/cuda_inference_py_txi_gpt2.yaml | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml index a32060b1..a3d33af5 100644 --- a/examples/cuda_tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -18,6 +18,7 @@ backend: cuda_graphs: 0 # remove for better perf but bigger memory footprint no_weights: true model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + image: ghcr.io/huggingface/text-generation-inference:2.4.1 scenario: input_shapes: diff --git a/tests/configs/_no_weights_.yaml b/tests/configs/_no_weights_.yaml index bb0afa43..31bbf2eb 100644 --- a/tests/configs/_no_weights_.yaml +++ b/tests/configs/_no_weights_.yaml @@ -2,4 +2,4 @@ hydra: mode: MULTIRUN sweeper: params: - backend.no_weights: true + backend.no_weights: true,false diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 82a522bd..23c18416 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -12,3 +12,4 @@ name: cpu_inference_py_txi_gpt2 backend: cuda_graphs: 0 + image: ghcr.io/huggingface/text-generation-inference:2.4.1 diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml 
b/tests/configs/cuda_inference_py_txi_gpt2.yaml index d0d17dbc..75e20094 100644 --- a/tests/configs/cuda_inference_py_txi_gpt2.yaml +++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml @@ -12,3 +12,4 @@ name: cuda_inference_py_txi_gpt2 backend: cuda_graphs: 0 + image: ghcr.io/huggingface/text-generation-inference:2.4.1 From a8c41596d75cafcda5f676bb38d1ce8ab150ac35 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 17:18:26 +0100 Subject: [PATCH 21/30] disable no weights on tgi cuda for now --- examples/cuda_tgi_llama.yaml | 3 +-- optimum_benchmark/backends/py_txi/backend.py | 11 ++++++----- tests/configs/cpu_inference_py_txi_gpt2.yaml | 4 ---- tests/configs/cuda_inference_py_txi_gpt2.yaml | 5 ----- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml index a3d33af5..16d2f5f0 100644 --- a/examples/cuda_tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -16,9 +16,8 @@ backend: device: cuda device_ids: 0 cuda_graphs: 0 # remove for better perf but bigger memory footprint - no_weights: true + no_weights: false model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 - image: ghcr.io/huggingface/text-generation-inference:2.4.1 scenario: input_shapes: diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 0f52bc50..00e1044d 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -46,16 +46,12 @@ def download_pretrained_model(self) -> None: def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + filename = os.path.join(self.no_weights_model, "model.safetensors") os.makedirs(self.no_weights_model, exist_ok=True) self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - if self.config.task in TEXT_GENERATION_TASKS: - self.generation_config.eos_token_id = None - self.generation_config.pad_token_id = None - self.generation_config.save_pretrained(save_directory=self.no_weights_model) - filename = os.path.join(self.no_weights_model, "model.safetensors") save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model @@ -66,6 +62,11 @@ def create_no_weights_model(self) -> None: del self.pretrained_model torch.cuda.empty_cache() + if self.config.task in TEXT_GENERATION_TASKS: + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + self.generation_config.save_pretrained(save_directory=self.no_weights_model) + def load_model_with_no_weights(self) -> None: self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}} original_model, self.config.model = self.config.model, "/no_weights_model/" diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 23c18416..1aef598e 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -9,7 +9,3 @@ defaults: - override backend: py-txi name: cpu_inference_py_txi_gpt2 - -backend: - cuda_graphs: 0 - image: ghcr.io/huggingface/text-generation-inference:2.4.1 diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml index 75e20094..73a5c10a 100644 --- 
From a8c41596d75cafcda5f676bb38d1ce8ab150ac35 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 17:18:26 +0100
Subject: [PATCH 21/30] disable no weights on tgi cuda for now

---
 examples/cuda_tgi_llama.yaml                  |  3 +--
 optimum_benchmark/backends/py_txi/backend.py  | 11 ++++++-----
 tests/configs/cpu_inference_py_txi_gpt2.yaml  |  4 ----
 tests/configs/cuda_inference_py_txi_gpt2.yaml |  5 -----
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml
index a3d33af5..16d2f5f0 100644
--- a/examples/cuda_tgi_llama.yaml
+++ b/examples/cuda_tgi_llama.yaml
@@ -16,9 +16,8 @@ backend:
   device: cuda
   device_ids: 0
   cuda_graphs: 0 # remove for better perf but bigger memory footprint
-  no_weights: true
+  no_weights: false
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1

 scenario:
   input_shapes:
diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 0f52bc50..00e1044d 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -46,16 +46,12 @@ def download_pretrained_model(self) -> None:

     def create_no_weights_model(self) -> None:
         self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
+        filename = os.path.join(self.no_weights_model, "model.safetensors")
         os.makedirs(self.no_weights_model, exist_ok=True)

         self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
         self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model)

-        if self.config.task in TEXT_GENERATION_TASKS:
-            self.generation_config.eos_token_id = None
-            self.generation_config.pad_token_id = None
-            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
-
-        filename = os.path.join(self.no_weights_model, "model.safetensors")
         save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"})
         with fast_weights_init():
             # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
@@ -66,6 +62,11 @@ def create_no_weights_model(self) -> None:
         del self.pretrained_model
         torch.cuda.empty_cache()

+        if self.config.task in TEXT_GENERATION_TASKS:
+            self.generation_config.eos_token_id = None
+            self.generation_config.pad_token_id = None
+            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
+
     def load_model_with_no_weights(self) -> None:
         self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}}
         original_model, self.config.model = self.config.model, "/no_weights_model/"
diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml
index 23c18416..1aef598e 100644
--- a/tests/configs/cpu_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml
@@ -9,7 +9,3 @@ defaults:
   - override backend: py-txi

 name: cpu_inference_py_txi_gpt2
-
-backend:
-  cuda_graphs: 0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1
diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml
index 75e20094..73a5c10a 100644
--- a/tests/configs/cuda_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml
@@ -3,13 +3,8 @@ defaults:
   - _base_ # inherits from base config
   - _cuda_ # inherits from cuda config
   - _inference_ # inherits from inference config
-  - _no_weights_ # inherits from no weights config
   - _gpt2_ # inherits from gpt2 config
   - _self_ # hydra 1.1 compatibility
   - override backend: py-txi

 name: cuda_inference_py_txi_gpt2
-
-backend:
-  cuda_graphs: 0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1
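The reordering above writes the generation config only after the model has been materialized and freed. The underlying "no weights" trick is independent of TXI and can be sketched on its own: build a randomly initialized model purely from its config, then serialize it so the server has real tensors to load (the model id here is just an example):

```python
# Standalone sketch of the no-weights idea; gpt2 is an illustrative model id.
import torch
from safetensors.torch import save_model
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_config(config)  # random init, no weight download
save_model(model=model, filename="model.safetensors", metadata={"format": "pt"})
```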
From 9c1cd0ca8e2cafa96ac4d71bd0622a62eb0b5a75 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 19:10:34 +0100
Subject: [PATCH 22/30] test

---
 optimum_benchmark/backends/py_txi/backend.py  | 25 ++++++++-----------
 tests/configs/cuda_inference_py_txi_gpt2.yaml |  1 +
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 00e1044d..912ea86c 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,9 +1,10 @@
 import os
+from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List

 import torch
-from huggingface_hub import snapshot_download
+from huggingface_hub import hf_hub_download, snapshot_download
 from py_txi import TEI, TGI, TEIConfig, TGIConfig
 from safetensors.torch import save_model

@@ -45,33 +46,29 @@ def download_pretrained_model(self) -> None:
         self.generation_config.save_pretrained(save_directory=model_snapshot_folder)

     def create_no_weights_model(self) -> None:
-        self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
-        filename = os.path.join(self.no_weights_model, "model.safetensors")
-        os.makedirs(self.no_weights_model, exist_ok=True)
-
-        self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
-        self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model)
-
-        save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"})
+        model_path = Path(hf_hub_download(self.config.model, filename="config.json", cache_dir=self.tmpdir.name)).parent
+        save_model(model=torch.nn.Linear(1, 1), filename=model_path / "model.safetensors", metadata={"format": "pt"})
+        self.pretrained_processor.save_pretrained(save_directory=model_path)
+        self.pretrained_config.save_pretrained(save_directory=model_path)

         with fast_weights_init():
             # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
             self.pretrained_model = self.automodel_loader.from_pretrained(
-                self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False
+                model_path, **self.config.model_kwargs, device_map="auto", _fast_init=False
             )
-        save_model(model=self.pretrained_model, filename=filename, metadata={"format": "pt"})
+        save_model(model=self.pretrained_model, filename=model_path / "model.safetensors", metadata={"format": "pt"})
         del self.pretrained_model
         torch.cuda.empty_cache()

         if self.config.task in TEXT_GENERATION_TASKS:
             self.generation_config.eos_token_id = None
             self.generation_config.pad_token_id = None
-            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
+            self.generation_config.save_pretrained(save_directory=model_path)

     def load_model_with_no_weights(self) -> None:
-        self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}}
-        original_model, self.config.model = self.config.model, "/no_weights_model/"
+        original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}}
         self.load_model_from_pretrained()
-        self.config.model = original_model
+        self.config.volumes = original_volumes

     def load_model_from_pretrained(self) -> None:
         if self.config.task in TEXT_GENERATION_TASKS:
diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml
index 73a5c10a..1c93ac36 100644
--- a/tests/configs/cuda_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml
@@ -3,6 +3,7 @@ defaults:
   - _base_ # inherits from base config
   - _cuda_ # inherits from cuda config
   - _inference_ # inherits from inference config
+  - _no_weights_ # inherits from no weights config
   - _gpt2_ # inherits from gpt2 config
   - _self_ # hydra 1.1 compatibility
   - override backend: py-txi

From 95ee4b455d6b9a554d4f446abfd3435d5eeb2324 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 19:10:50 +0100
Subject: [PATCH 23/30] style

---
 optimum_benchmark/backends/py_txi/backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 912ea86c..e25e2def 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List
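The `hf_hub_download` call above does double duty: it fetches only `config.json`, and its return path lands inside a hub-style snapshot folder under `cache_dir`, so `.parent` is a directory that already has the layout the TGI cache expects. A small sketch of that property (cache path is illustrative):

```python
# Sketch: the parent of a hf_hub_download path is the snapshot directory.
from pathlib import Path
from huggingface_hub import hf_hub_download

config_file = hf_hub_download("gpt2", filename="config.json", cache_dir="/tmp/nw_cache")
snapshot_dir = Path(config_file).parent
print(snapshot_dir)  # .../models--gpt2/snapshots/<commit-sha>/
```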
From e17e220701eef4eb7f83265976187b99670b618f Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:23:25 +0100
Subject: [PATCH 24/30] test

---
 optimum_benchmark/backends/py_txi/backend.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index e25e2def..f357936b 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union

 import torch
 from huggingface_hub import hf_hub_download, snapshot_download
@@ -15,6 +15,7 @@ class PyTXIBackend(Backend[PyTXIConfig]):
     NAME: str = "py-txi"
+    pretrained_model: Union[TEI, TGI]

     def __init__(self, config: PyTXIConfig) -> None:
         super().__init__(config)
@@ -65,7 +66,10 @@ def create_no_weights_model(self) -> None:
             self.generation_config.save_pretrained(save_directory=model_path)

     def load_model_with_no_weights(self) -> None:
-        original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}}
+        original_volumes, self.config.volumes = (
+            self.config.volumes,
+            {Path(self.tmpdir.name) / "hub": {"bind": "/data", "mode": "rw"}},
+        )
         self.load_model_from_pretrained()
         self.config.volumes = original_volumes

From 307df8112f0e03ce5269e8f05ae8edd6fd215c96 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:28:04 +0100
Subject: [PATCH 25/30] test

---
 optimum_benchmark/backends/py_txi/backend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index f357936b..21237136 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Union
@@ -68,7 +69,7 @@ def create_no_weights_model(self) -> None:

     def load_model_with_no_weights(self) -> None:
         original_volumes, self.config.volumes = (
             self.config.volumes,
-            {Path(self.tmpdir.name) / "hub": {"bind": "/data", "mode": "rw"}},
+            {self.tmpdir.name: {"bind": "/data/hub/", "mode": "rw"}},
         )
         self.load_model_from_pretrained()
         self.config.volumes = original_volumes
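These two commits iterate on where the temporary directory should be mounted; the dict being tweaked is Docker's volume-mapping format, which py-txi forwards to the Docker SDK. A hedged sketch of what that mapping means once it reaches `docker-py` (image tag and host path are illustrative):

```python
# Sketch of the {host_path: {"bind": ..., "mode": ...}} volume format in docker-py.
import docker

client = docker.from_env()
container = client.containers.run(
    "ghcr.io/huggingface/text-generation-inference:2.4.1",
    volumes={"/tmp/nw_cache": {"bind": "/data", "mode": "rw"}},  # host dir -> /data in container
    detach=True,
)
container.stop()
```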
From 7336fce60583d806f9a3d7a43c644d594e740e82 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:38:03 +0100
Subject: [PATCH 26/30] catch errors

---
 optimum_benchmark/backends/py_txi/backend.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 21237136..e7e298f4 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,4 +1,4 @@
-import os
+import shutil
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Union
@@ -36,7 +36,10 @@ def load(self) -> None:
             self.logger.info("\t+ Loading pretrained model")
             self.load_model_from_pretrained()

-        self.tmpdir.cleanup()
+        try:
+            self.tmpdir.cleanup()
+        except Exception:
+            shutil.rmtree(self.tmpdir.name)

     def download_pretrained_model(self) -> None:
         model_snapshot_folder = snapshot_download(self.config.model, **self.config.model_kwargs)
@@ -49,6 +52,7 @@ def download_pretrained_model(self) -> None:
     def create_no_weights_model(self) -> None:
         model_path = Path(hf_hub_download(self.config.model, filename="config.json", cache_dir=self.tmpdir.name)).parent
         save_model(model=torch.nn.Linear(1, 1), filename=model_path / "model.safetensors", metadata={"format": "pt"})
+
         self.pretrained_processor.save_pretrained(save_directory=model_path)
         self.pretrained_config.save_pretrained(save_directory=model_path)

@@ -57,6 +61,7 @@ def create_no_weights_model(self) -> None:
             self.pretrained_model = self.automodel_loader.from_pretrained(
                 model_path, **self.config.model_kwargs, device_map="auto", _fast_init=False
             )
+
         save_model(model=self.pretrained_model, filename=model_path / "model.safetensors", metadata={"format": "pt"})
         del self.pretrained_model
         torch.cuda.empty_cache()

From 942f0e07acdd70b2838284d83fb75c838ab64c46 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:42:20 +0100
Subject: [PATCH 27/30] ignore errors

---
 optimum_benchmark/backends/py_txi/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index e7e298f4..3acd2e42 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -39,7 +39,7 @@ def load(self) -> None:
         try:
             self.tmpdir.cleanup()
         except Exception:
-            shutil.rmtree(self.tmpdir.name)
+            shutil.rmtree(self.tmpdir.name, ignore_errors=True)
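`TemporaryDirectory.cleanup()` can raise (a `PermissionError`, for example) when the container wrote root-owned files into the bind-mounted directory; the fallback retries with `ignore_errors=True`. The pattern in isolation:

```python
# Standalone version of the cleanup fallback introduced above.
import shutil
from tempfile import TemporaryDirectory

tmpdir = TemporaryDirectory()
try:
    tmpdir.cleanup()
except Exception:
    # e.g. root-owned files left behind by a container; best-effort removal
    shutil.rmtree(tmpdir.name, ignore_errors=True)
```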
From 9d5cf6cca0449996de328e7b2354014b53bec4a2 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 21:12:06 +0100
Subject: [PATCH 28/30] test

---
 optimum_benchmark/backends/py_txi/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 3acd2e42..014af25f 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -74,7 +74,7 @@ def create_no_weights_model(self) -> None:
     def load_model_with_no_weights(self) -> None:
         original_volumes, self.config.volumes = (
             self.config.volumes,
-            {self.tmpdir.name: {"bind": "/data/hub/", "mode": "rw"}},
+            {self.tmpdir.name: {"bind": "/data", "mode": "rw"}},
         )
         self.load_model_from_pretrained()
         self.config.volumes = original_volumes

From 2e76c97470bebb9c95b330b7465c6e87b98cdc4a Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Tue, 17 Dec 2024 09:37:47 +0100
Subject: [PATCH 29/30] test

---
 examples/cuda_tgi_llama.yaml                 |  2 +-
 optimum_benchmark/backends/py_txi/backend.py | 11 +++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml
index 16d2f5f0..a32060b1 100644
--- a/examples/cuda_tgi_llama.yaml
+++ b/examples/cuda_tgi_llama.yaml
@@ -16,7 +16,7 @@ backend:
   device: cuda
   device_ids: 0
   cuda_graphs: 0 # remove for better perf but bigger memory footprint
-  no_weights: false
+  no_weights: true
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

 scenario:
diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 014af25f..55aecab9 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -59,7 +59,10 @@ def create_no_weights_model(self) -> None:
         with fast_weights_init():
             # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
             self.pretrained_model = self.automodel_loader.from_pretrained(
-                model_path, **self.config.model_kwargs, device_map="auto", _fast_init=False
+                model_path,
+                _fast_init=False,
+                device_map="auto",
+                **self.config.model_kwargs,
             )

         save_model(model=self.pretrained_model, filename=model_path / "model.safetensors", metadata={"format": "pt"})
@@ -72,12 +75,8 @@ def create_no_weights_model(self) -> None:
             self.generation_config.save_pretrained(save_directory=model_path)

     def load_model_with_no_weights(self) -> None:
-        original_volumes, self.config.volumes = (
-            self.config.volumes,
-            {self.tmpdir.name: {"bind": "/data", "mode": "rw"}},
-        )
+        self.config.volumes = {self.tmpdir.name: {"bind": "/data", "mode": "rw"}}
         self.load_model_from_pretrained()
-        self.config.volumes = original_volumes

     def load_model_from_pretrained(self) -> None:
         if self.config.task in TEXT_GENERATION_TASKS:
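After this commit the volume swap is no longer restored, so the method reduces to a single assignment plus the normal load path. A reconstruction of its final shape, reduced to a standalone function over a config-like object for illustration:

```python
# Pieced together from the hunks above; SimpleNamespace stands in for the config.
from types import SimpleNamespace

def set_no_weights_volumes(config: SimpleNamespace, tmpdir_name: str) -> None:
    # bind the temp HF cache into the container's /data cache directory
    config.volumes = {tmpdir_name: {"bind": "/data", "mode": "rw"}}

config = SimpleNamespace(volumes={})
set_no_weights_volumes(config, "/tmp/nw_cache")
print(config.volumes)
```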
From e29697953f19c698f36b329569392c91c5783cc6 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Tue, 17 Dec 2024 10:26:50 +0100
Subject: [PATCH 30/30] update readme

---
 README.md                       | 18 ++++++------------
 examples/cuda_pytorch_bert.yaml |  2 +-
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 6358b341..9203b778 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,6 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices

 [![CLI_CPU_IPEX](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_ipex.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_ipex.yaml)
 [![CLI_CPU_LLAMA_CPP](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml)
-[![CLI_CPU_NEURAL_COMPRESSOR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
 [![CLI_CPU_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
 [![CLI_CPU_OPENVINO](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
 [![CLI_CPU_PYTORCH](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml)
@@ -61,7 +60,6 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices
 [![CLI_CUDA_TENSORRT_LLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml)
 [![CLI_CUDA_TORCH_ORT](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml)
 [![CLI_CUDA_VLLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml)
-[![CLI_ENERGY_STAR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_energy_star.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_energy_star.yaml)
 [![CLI_MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml)
 [![CLI_ROCM_PYTORCH](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml)
@@ -100,10 +98,9 @@ Depending on the backends you want to use, you can install `optimum-benchmark` with the following extras:

 - OnnxRuntime: `pip install optimum-benchmark[onnxruntime]`
 - TensorRT-LLM: `pip install optimum-benchmark[tensorrt-llm]`
 - OnnxRuntime-GPU: `pip install optimum-benchmark[onnxruntime-gpu]`
-- Neural Compressor: `pip install optimum-benchmark[neural-compressor]`
-- Py-TXI: `pip install optimum-benchmark[py-txi]`
-- IPEX: `pip install optimum-benchmark[ipex]`
+- Py-TXI (TGI & TEI): `pip install optimum-benchmark[py-txi]`
 - vLLM: `pip install optimum-benchmark[vllm]`
+- IPEX: `pip install optimum-benchmark[ipex]`

 We also support the following extra dependencies:
@@ -144,9 +141,6 @@ if __name__ == "__main__":
     )
     benchmark_report = Benchmark.launch(benchmark_config)

-    # log the benchmark in terminal
-    benchmark_report.log() # or print(benchmark_report)
-
     # convert artifacts to a dictionary or dataframe
     benchmark_config.to_dict() # or benchmark_config.to_dataframe()

@@ -175,15 +169,17 @@ If you're on VSCode, you can hover over the configuration classes to see the available parameters.

 You can also run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension.

 ```bash
-optimum-benchmark --config-dir examples/ --config-name pytorch_bert
+optimum-benchmark --config-dir examples/ --config-name cuda_pytorch_bert
 ```

-This will run the benchmark using the configuration in [`examples/pytorch_bert.yaml`](examples/pytorch_bert.yaml) and store the results in `runs/pytorch_bert`.
+This will run the benchmark using the configuration in [`examples/cuda_pytorch_bert.yaml`](examples/cuda_pytorch_bert.yaml) and store the results in `runs/cuda_pytorch_bert`.

 The resulting files are:

 - `benchmark_config.json` which contains the configuration used for the benchmark, including the backend, launcher, scenario and the environment in which the benchmark was run.
 - `benchmark_report.json` which contains a full report of the benchmark's results, like latency measurements, memory usage, energy consumption, etc.
+- `benchmark_report.txt` which contains a detailed report of the benchmark's results, in the same format as they were logged.
+- `benchmark_report.md` which contains a detailed report of the benchmark's results, in markdown format.
 - `benchmark.json` contains both the report and the configuration in a single file.
 - `benchmark.log` contains the logs of the benchmark run.
@@ -309,9 +305,7 @@ For more information on the features of each backend, you can check their respective configuration pages:

 - [PyTorchConfig](optimum_benchmark/backends/pytorch/config.py)
 - [ORTConfig](optimum_benchmark/backends/onnxruntime/config.py)
 - [TorchORTConfig](optimum_benchmark/backends/torch_ort/config.py)
-- [LLMSwarmConfig](optimum_benchmark/backends/llm_swarm/config.py)
 - [TRTLLMConfig](optimum_benchmark/backends/tensorrt_llm/config.py)
-- [INCConfig](optimum_benchmark/backends/neural_compressor/config.py)
diff --git a/examples/cuda_pytorch_bert.yaml b/examples/cuda_pytorch_bert.yaml
index 8ab9b5cb..195e8a02 100644
--- a/examples/cuda_pytorch_bert.yaml
+++ b/examples/cuda_pytorch_bert.yaml
@@ -6,7 +6,7 @@ defaults:
   - _base_
   - _self_

-name: pytorch_bert
+name: cuda_pytorch_bert

 launcher:
   device_isolation: true
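The README section above names the artifacts each run produces; a short sketch of inspecting them after a run (the path assumes the `runs/cuda_pytorch_bert` output directory the README mentions):

```python
# Sketch: the JSON artifacts are plain files and can be read without the library.
import json

with open("runs/cuda_pytorch_bert/benchmark_report.json") as f:
    report = json.load(f)

# latency, memory and energy sections live alongside each other in the report
print(sorted(report.keys()))
```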