From cc2f77c4977c37ed8a86fc76f7a30a036a005449 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Dec 2024 13:52:18 +0100 Subject: [PATCH 01/30] protect token --- optimum_benchmark/backends/config.py | 33 ++++++++++++-------- optimum_benchmark/task_utils.py | 45 +++++++++++++++++++++------- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index fc265d4d..c47b7366 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -22,13 +22,13 @@ class BackendConfig(ABC): version: str _target_: str + model: Optional[str] = None + processor: Optional[str] = None + task: Optional[str] = None library: Optional[str] = None model_type: Optional[str] = None - model: Optional[str] = None - processor: Optional[str] = None - device: Optional[str] = None # we use a string here instead of a list # because it's easier to pass in a yaml or from cli @@ -48,30 +48,44 @@ def __post_init__(self): if self.model is None: raise ValueError("`model` must be specified.") + if self.model_kwargs.get("token", None) is not None: + LOGGER.info( + "You have passed an argument `token` to `model_kwargs`. This is dangerous as the config cannot do encryption to protect it. " + "We will proceed to registering `token` in the environment as `HF_TOKEN` to avoid saving it or pushing it to the hub by mistake." + ) + os.environ["HF_TOKEN"] = self.model_kwargs.pop("token") + if self.processor is None: self.processor = self.model - # TODO: add cache_dir, token, etc. to these methods + if not self.processor_kwargs: + self.processor_kwargs = self.model_kwargs + if self.library is None: self.library = infer_library_from_model_name_or_path( model_name_or_path=self.model, - token=self.model_kwargs.get("token", None), revision=self.model_kwargs.get("revision", None), + cache_dir=self.model_kwargs.get("cache_dir", None), + ) + + if self.library not in ["transformers", "diffusers", "timm", "llama_cpp"]: + raise ValueError( + f"`library` must be either `transformers`, `diffusers`, `timm` or `llama_cpp`, but got {self.library}" ) if self.task is None: self.task = infer_task_from_model_name_or_path( model_name_or_path=self.model, - token=self.model_kwargs.get("token", None), revision=self.model_kwargs.get("revision", None), + cache_dir=self.model_kwargs.get("cache_dir", None), library_name=self.library, ) if self.model_type is None: self.model_type = infer_model_type_from_model_name_or_path( model_name_or_path=self.model, - token=self.model_kwargs.get("token", None), revision=self.model_kwargs.get("revision", None), + cache_dir=self.model_kwargs.get("cache_dir", None), library_name=self.library, ) @@ -103,11 +117,6 @@ def __post_init__(self): else: raise RuntimeError("CUDA device is only supported on systems with NVIDIA or ROCm drivers.") - if self.library not in ["transformers", "diffusers", "timm", "llama_cpp"]: - raise ValueError( - f"`library` must be either `transformers`, `diffusers`, `timm` or `llama_cpp`, but got {self.library}" - ) - if self.inter_op_num_threads is not None: if self.inter_op_num_threads == -1: self.inter_op_num_threads = cpu_count() diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index 7c066d14..45e3a342 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -155,7 +155,11 @@ def is_local_dir_repo(model_name_or_path: str) -> bool: def get_repo_config( - model_name_or_path: str, config_name: str, token: Optional[str] = None, 
revision: Optional[str] = None + model_name_or_path: str, + config_name: str, + token: Optional[str] = None, + revision: Optional[str] = None, + cache_dir: Optional[str] = None, ): if is_hf_hub_repo(model_name_or_path, token=token): config = json.load( @@ -163,6 +167,7 @@ def get_repo_config( huggingface_hub.hf_hub_download( repo_id=model_name_or_path, filename=config_name, + cache_dir=cache_dir, revision=revision, token=token, ), @@ -197,6 +202,7 @@ def infer_library_from_model_name_or_path( model_name_or_path: str, token: Optional[str] = None, revision: Optional[str] = None, + cache_dir: Optional[str] = None, ) -> str: inferred_library_name = None @@ -209,7 +215,9 @@ def infer_library_from_model_name_or_path( inferred_library_name = "sentence-transformers" elif "config.json" in repo_files: - config_dict = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + config_dict = get_repo_config( + model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) if "pretrained_cfg" in config_dict: inferred_library_name = "timm" @@ -229,12 +237,15 @@ def infer_task_from_model_name_or_path( model_name_or_path: str, token: Optional[str] = None, revision: Optional[str] = None, + cache_dir: Optional[str] = None, library_name: Optional[str] = None, ) -> str: inferred_task_name = None if library_name is None: - library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision, token=token) + library_name = infer_library_from_model_name_or_path( + model_name_or_path, revision=revision, token=token, cache_dir=cache_dir + ) if library_name == "llama_cpp": inferred_task_name = "text-generation" @@ -243,7 +254,9 @@ def infer_task_from_model_name_or_path( inferred_task_name = "image-classification" elif library_name == "transformers": - transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + transformers_config = get_repo_config( + model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) target_class_name = transformers_config["architectures"][0] for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items(): @@ -258,7 +271,9 @@ def infer_task_from_model_name_or_path( raise KeyError(f"Could not find the proper task name for target class name {target_class_name}.") elif library_name == "diffusers": - diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision) + diffusers_config = get_repo_config( + model_name_or_path, "model_index.json", token=token, revision=revision, cache_dir=cache_dir + ) target_class_name = diffusers_config["_class_name"] for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items(): @@ -279,26 +294,35 @@ def infer_model_type_from_model_name_or_path( model_name_or_path: str, token: Optional[str] = None, revision: Optional[str] = None, + cache_dir: Optional[str] = None, library_name: Optional[str] = None, ) -> str: inferred_model_type = None if library_name is None: - library_name = infer_library_from_model_name_or_path(model_name_or_path, revision=revision, token=token) + library_name = infer_library_from_model_name_or_path( + model_name_or_path, revision=revision, token=token, cache_dir=cache_dir + ) if library_name == "llama_cpp": inferred_model_type = "llama_cpp" elif library_name == "timm": - timm_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + timm_config = get_repo_config( + 
model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) inferred_model_type = timm_config["architecture"] elif library_name == "transformers": - transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision) + transformers_config = get_repo_config( + model_name_or_path, "config.json", token=token, revision=revision, cache_dir=cache_dir + ) inferred_model_type = transformers_config["model_type"] elif library_name == "diffusers": - diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision) + diffusers_config = get_repo_config( + model_name_or_path, "model_index.json", token=token, revision=revision, cache_dir=cache_dir + ) target_class_name = diffusers_config["_class_name"] for _, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items(): @@ -310,6 +334,7 @@ def infer_model_type_from_model_name_or_path( break if inferred_model_type is None: - raise KeyError(f"Could not find the proper model type for target class name {target_class_name}.") + # we use the class name in this case + inferred_model_type = target_class_name.replace("DiffusionPipeline", "").replace("Pipeline", "") return inferred_model_type From 0b2f878ea24a4c31922e687ab0d93fab7acfaee5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Dec 2024 14:35:41 +0100 Subject: [PATCH 02/30] no_weights for TXI --- optimum_benchmark/backends/py_txi/backend.py | 4 +--- optimum_benchmark/backends/py_txi/config.py | 15 ++++++--------- tests/configs/cpu_inference_py_txi_gpt2.yaml | 1 + tests/configs/cpu_inference_py_txi_st_bert.yaml | 1 + tests/configs/cuda_inference_py_txi_gpt2.yaml | 1 + tests/configs/cuda_inference_py_txi_st_bert.yaml | 1 + 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 6e637a31..1b02277a 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -53,7 +53,6 @@ def download_pretrained_model(self) -> None: def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") model_cache_path = f"{self.volume}/{model_cache_folder}" snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" @@ -95,8 +94,7 @@ def create_no_weights_model(self) -> None: def load_model_with_no_weights(self) -> None: original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/data/no_weights_model" - self.logger.info("\t+ Loading no weights model") + original_model, self.config.model = self.config.model, "/data/no_weights_model/" self.load_model_from_pretrained() self.config.model, self.config.volumes = original_model, original_volumes diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index dae410c4..2bf6c04c 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -22,7 +22,7 @@ class PyTXIConfig(BackendConfig): # Image to use for the container image: Optional[str] = None # Shared memory size for the container - shm_size: str = "1g" + shm_size: Optional[str] = None # List of custom devices to forward to the container e.g. 
["/dev/kfd", "/dev/dri"] for ROCm devices: Optional[List[str]] = None # NVIDIA-docker GPU device options e.g. "all" (all) or "0,1,2,3" (ids) or 4 (count) @@ -41,9 +41,13 @@ class PyTXIConfig(BackendConfig): metadata={"help": "List of environment variables to forward to the container from the host."}, ) + # first connection/request + connection_timeout: int = 60 + first_request_timeout: int = 60 + max_concurrent_requests: Optional[int] = None + # Common options dtype: Optional[str] = None - max_concurrent_requests: Optional[int] = None # TGI specific sharded: Optional[str] = None @@ -72,13 +76,6 @@ def __post_init__(self): renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in ids] - # Common options - if self.max_concurrent_requests is None: - if self.task in TEXT_GENERATION_TASKS: - self.max_concurrent_requests = 128 - elif self.task in TEXT_EMBEDDING_TASKS: - self.max_concurrent_requests = 512 - # TGI specific if self.task in TEXT_GENERATION_TASKS: if self.trust_remote_code is None: diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 76e90775..1aef598e 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _gpt2_ # inherits from gpt2 config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/configs/cpu_inference_py_txi_st_bert.yaml b/tests/configs/cpu_inference_py_txi_st_bert.yaml index 2650e1bf..99e571b5 100644 --- a/tests/configs/cpu_inference_py_txi_st_bert.yaml +++ b/tests/configs/cpu_inference_py_txi_st_bert.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _st_bert_ # inherits from bert config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml index 73a5c10a..1c93ac36 100644 --- a/tests/configs/cuda_inference_py_txi_gpt2.yaml +++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _gpt2_ # inherits from gpt2 config - _self_ # hydra 1.1 compatibility - override backend: py-txi diff --git a/tests/configs/cuda_inference_py_txi_st_bert.yaml b/tests/configs/cuda_inference_py_txi_st_bert.yaml index 8ae494e7..5bb38528 100644 --- a/tests/configs/cuda_inference_py_txi_st_bert.yaml +++ b/tests/configs/cuda_inference_py_txi_st_bert.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cuda_ # inherits from cuda config - _inference_ # inherits from inference config + - _no_weights_ # inherits from no weights config - _st_bert_ # inherits from bert config - _self_ # hydra 1.1 compatibility - override backend: py-txi From 6fb84c6eb4dd3f90e67ee56a27a8a6315cf925ea Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 13 Dec 2024 14:44:09 +0100 Subject: [PATCH 03/30] fix --- optimum_benchmark/backends/py_txi/config.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index 2bf6c04c..9cc23a64 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -76,7 +76,9 @@ def __post_init__(self): renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in ids] - # TGI specific - if self.task in TEXT_GENERATION_TASKS: - if self.trust_remote_code is None: - self.trust_remote_code = self.model_kwargs.get("trust_remote_code", False) + # Common options + if self.max_concurrent_requests is None: + if self.task in TEXT_GENERATION_TASKS: + self.max_concurrent_requests = 128 + elif self.task in TEXT_EMBEDDING_TASKS: + self.max_concurrent_requests = 512 From e936705e57c447f17a1bb7a37449cd18c449250b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sun, 15 Dec 2024 18:02:05 +0100 Subject: [PATCH 04/30] test --- optimum_benchmark/backends/py_txi/backend.py | 6 +++--- optimum_benchmark/backends/py_txi/config.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 1b02277a..184f2518 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -93,10 +93,10 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/data/no_weights_model/" + self.config.volumes[self.tmpdir.name] = {"bind": "/no_weights_data/", "mode": "rw"} + original_model, self.config.model = self.config.model, "/no_weights_data/no_weights_model/" self.load_model_from_pretrained() - self.config.model, self.config.volumes = original_model, original_volumes + self.config.model = original_model def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index 9cc23a64..bde8ab43 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -37,7 +37,7 @@ class PyTXIConfig(BackendConfig): metadata={"help": "Dictionary of volumes to mount inside the container."}, ) environment: List[str] = field( - default_factory=lambda: ["HUGGING_FACE_HUB_TOKEN"], + default_factory=lambda: ["HF_TOKEN"], metadata={"help": "List of environment variables to forward to the container from the host."}, ) From 4289798baed85ab17e76946c5d7c7b93f9a7916a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sun, 15 Dec 2024 18:11:02 +0100 Subject: [PATCH 05/30] force txi sequential in cuda ci --- .github/workflows/test_cli_cuda_py_txi.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml index 5c090b28..b8c50db0 100644 --- a/.github/workflows/test_cli_cuda_py_txi.yaml +++ b/.github/workflows/test_cli_cuda_py_txi.yaml @@ -48,7 +48,8 @@ jobs: pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git - name: Run tests - run: pytest tests/test_cli.py -x -s -k "cli and cuda and py_txi" + run: | + FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli 
and cuda and py_txi" - if: ${{ (github.event_name == 'push') || @@ -56,4 +57,5 @@ jobs: contains( github.event.pull_request.labels.*.name, 'examples') }} name: Run examples - run: pytest tests/test_examples.py -x -s -k "cli and cuda and (tgi or tei)" + run: | + FORCE_SEQUENTIAL=1 pytest tests/test_examples.py -x -s -k "cli and cuda and (tgi or tei)" From ecaa6c87ad151ea16df184e8046d77ba5bb5f2f8 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Sun, 15 Dec 2024 18:46:29 +0100 Subject: [PATCH 06/30] test --- optimum_benchmark/backends/py_txi/backend.py | 27 +++++++------------ tests/configs/cpu_inference_py_txi_gpt2.yaml | 3 +++ tests/configs/cuda_inference_py_txi_gpt2.yaml | 3 +++ 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 184f2518..c56a4989 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -63,40 +63,33 @@ def prepare_generation_config(self) -> None: def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ Saving no weights model safetensors") safetensor = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=state_dict, filename=safetensor, metadata={"format": "pt"}) - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - self.logger.info("\t+ Saving no weights model pretrained processor") - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model - self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") with fast_weights_init(): + # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - self.logger.info("\t+ Saving no weights model") - self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) + save_file(tensors=self.pretrained_model.state_dict(), filename=safetensor, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() - if self.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Modifying generation config for fixed length generation") + if self.pretrained_config is not None: + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + if self.pretrained_processor is not None: + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + if self.generation_config is not None: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes[self.tmpdir.name] = {"bind": "/no_weights_data/", "mode": "rw"} - original_model, self.config.model = self.config.model, "/no_weights_data/no_weights_model/" + self.config.volumes = (self.config.volumes, 
{self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}}) + original_model, self.config.model = self.config.model, self.no_weights_model self.load_model_from_pretrained() - self.config.model = original_model + self.config.model, self.config.volumes = original_model def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 1aef598e..82a522bd 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -9,3 +9,6 @@ defaults: - override backend: py-txi name: cpu_inference_py_txi_gpt2 + +backend: + cuda_graphs: 0 diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml index 1c93ac36..d0d17dbc 100644 --- a/tests/configs/cuda_inference_py_txi_gpt2.yaml +++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml @@ -9,3 +9,6 @@ defaults: - override backend: py-txi name: cuda_inference_py_txi_gpt2 + +backend: + cuda_graphs: 0 From ea328024a95ad2fdc3143b39735442d300d5fb6e Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:34:46 +0100 Subject: [PATCH 07/30] test --- optimum_benchmark/backends/py_txi/backend.py | 150 ++++++++++++------- optimum_benchmark/backends/py_txi/config.py | 41 ++--- 2 files changed, 110 insertions(+), 81 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index c56a4989..5afe1e1e 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -31,11 +31,6 @@ def load(self) -> None: else: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() - - if self.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Preparing generation config") - self.prepare_generation_config() - self.logger.info("\t+ Loading pretrained model") self.load_model_from_pretrained() @@ -50,29 +45,30 @@ def download_pretrained_model(self) -> None: with init_empty_weights(include_buffers=True): self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) - def prepare_generation_config(self) -> None: - self.generation_config.eos_token_id = None - self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = f"{self.volume}/{model_cache_folder}" - snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" - snapshot_ref = open(snapshot_file, "r").read().strip() - model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving new pretrained generation config") - self.generation_config.save_pretrained(save_directory=model_snapshot_path) + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Preparing generation config") + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + model_cache_folder = f"models/{self.config.model}".replace("/", "--") + model_cache_path = f"{self.volume}/{model_cache_folder}" + snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" + snapshot_ref = open(snapshot_file, "r").read().strip() + model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" + self.logger.info("\t+ Saving pretrained generation config") + self.generation_config.save_pretrained(save_directory=model_snapshot_path) 
def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + filename = os.path.join(self.no_weights_model, "model.safetensors") os.makedirs(self.no_weights_model, exist_ok=True) - state_dict = torch.nn.Linear(1, 1).state_dict() - safetensor = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=state_dict, filename=safetensor, metadata={"format": "pt"}) + + save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - save_file(tensors=self.pretrained_model.state_dict(), filename=safetensor, metadata={"format": "pt"}) + save_file(tensors=self.pretrained_model.state_dict(), filename=filename, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() @@ -80,56 +76,108 @@ def create_no_weights_model(self) -> None: self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) if self.pretrained_processor is not None: self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - if self.generation_config is not None: + + if self.config.task in TEXT_GENERATION_TASKS: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = (self.config.volumes, {self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}}) + self.config.volumes = {self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}} original_model, self.config.model = self.config.model, self.no_weights_model self.load_model_from_pretrained() - self.config.model, self.config.volumes = original_model + self.config.model = original_model def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: self.pretrained_model = TGI( - config=TGIConfig( - model_id=self.config.model, - gpus=self.config.gpus, - devices=self.config.devices, - volumes=self.config.volumes, - environment=self.config.environment, - ports=self.config.ports, - dtype=self.config.dtype, - sharded=self.config.sharded, - quantize=self.config.quantize, - num_shard=self.config.num_shard, - speculate=self.config.speculate, - cuda_graphs=self.config.cuda_graphs, - disable_custom_kernels=self.config.disable_custom_kernels, - trust_remote_code=self.config.trust_remote_code, - max_concurrent_requests=self.config.max_concurrent_requests, - ), + config=TGIConfig(self.config.model, **self.txi_kwargs, **self.tgi_kwargs), ) - elif self.config.task in TEXT_EMBEDDING_TASKS: self.pretrained_model = TEI( - config=TEIConfig( - model_id=self.config.model, - gpus=self.config.gpus, - devices=self.config.devices, - volumes=self.config.volumes, - environment=self.config.environment, - ports=self.config.ports, - dtype=self.config.dtype, - pooling=self.config.pooling, - max_concurrent_requests=self.config.max_concurrent_requests, - ), + config=TEIConfig(self.config.model, **self.txi_kwargs, **self.tei_kwargs), ) else: raise NotImplementedError(f"TXI does not support task {self.config.task}") + @property + def txi_kwargs(self): + kwargs = {} + + if self.config.gpus is not None: + kwargs["gpus"] = self.config.gpus + + if self.config.image is not None: + 
kwargs["image"] = self.config.image + + if self.config.ports is not None: + kwargs["ports"] = self.config.ports + + if self.config.volumes is not None: + kwargs["volumes"] = self.config.volumes + + if self.config.devices is not None: + kwargs["devices"] = self.config.devices + + if self.config.shm_size is not None: + kwargs["shm_size"] = self.config.shm_size + + if self.config.environment is not None: + kwargs["environment"] = self.config.environment + + if self.config.connection_timeout is not None: + kwargs["connection_timeout"] = self.config.connection_timeout + + if self.config.first_request_timeout is not None: + kwargs["first_request_timeout"] = self.config.first_request_timeout + + if self.config.max_concurrent_requests is not None: + kwargs["max_concurrent_requests"] = self.config.max_concurrent_requests + + return kwargs + + @property + def tei_kwargs(self): + kwargs = {} + + if self.config.dtype is not None: + kwargs["dtype"] = self.config.dtype + + if self.config.pooling is not None: + kwargs["pooling"] = self.config.pooling + + return kwargs + + @property + def tgi_kwargs(self): + kwargs = {} + + if self.config.dtype is not None: + kwargs["dtype"] = self.config.dtype + + if self.config.sharded is not None: + kwargs["sharded"] = self.config.sharded + + if self.config.quantize is not None: + kwargs["quantize"] = self.config.quantize + + if self.config.num_shard is not None: + kwargs["num_shard"] = self.config.num_shard + + if self.config.speculate is not None: + kwargs["speculate"] = self.config.speculate + + if self.config.cuda_graphs is not None: + kwargs["cuda_graphs"] = self.config.cuda_graphs + + if self.config.trust_remote_code is not None: + kwargs["trust_remote_code"] = self.config.trust_remote_code + + if self.config.disable_custom_kernels is not None: + kwargs["disable_custom_kernels"] = self.config.disable_custom_kernels + + return kwargs + def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index bde8ab43..3b4a908d 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -1,9 +1,7 @@ import os -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE - from ...import_utils import py_txi_version from ...system_utils import is_nvidia_system, is_rocm_system from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS @@ -16,7 +14,7 @@ class PyTXIConfig(BackendConfig): version: Optional[str] = py_txi_version() _target_: str = "optimum_benchmark.backends.py_txi.backend.PyTXIBackend" - # optimum benchmark specific + # optimum-benchmark specific no_weights: bool = False # Image to use for the container @@ -28,27 +26,18 @@ class PyTXIConfig(BackendConfig): # NVIDIA-docker GPU device options e.g. 
"all" (all) or "0,1,2,3" (ids) or 4 (count) gpus: Optional[Union[str, int]] = None # Things to forward to the container - ports: Dict[str, Any] = field( - default_factory=lambda: {"80/tcp": ("127.0.0.1", 0)}, - metadata={"help": "Dictionary of ports to expose from the container."}, - ) - volumes: Dict[str, Any] = field( - default_factory=lambda: {HUGGINGFACE_HUB_CACHE: {"bind": "/data", "mode": "rw"}}, - metadata={"help": "Dictionary of volumes to mount inside the container."}, - ) - environment: List[str] = field( - default_factory=lambda: ["HF_TOKEN"], - metadata={"help": "List of environment variables to forward to the container from the host."}, - ) - - # first connection/request - connection_timeout: int = 60 - first_request_timeout: int = 60 + ports: Optional[Dict[str, Any]] = None + environment: Optional[List[str]] = None + volumes: Optional[Dict[str, Any]] = None + # First connection/request + connection_timeout: Optional[int] = None + first_request_timeout: Optional[int] = None max_concurrent_requests: Optional[int] = None # Common options dtype: Optional[str] = None - + # TEI specific + pooling: Optional[str] = None # TGI specific sharded: Optional[str] = None quantize: Optional[str] = None @@ -58,9 +47,6 @@ class PyTXIConfig(BackendConfig): trust_remote_code: Optional[bool] = None disable_custom_kernels: Optional[bool] = None - # TEI specific - pooling: Optional[str] = None - def __post_init__(self): super().__post_init__() @@ -76,9 +62,4 @@ def __post_init__(self): renderDs = [file for file in os.listdir("/dev/dri") if file.startswith("renderD")] self.devices = ["/dev/kfd"] + [f"/dev/dri/{renderDs[i]}" for i in ids] - # Common options - if self.max_concurrent_requests is None: - if self.task in TEXT_GENERATION_TASKS: - self.max_concurrent_requests = 128 - elif self.task in TEXT_EMBEDDING_TASKS: - self.max_concurrent_requests = 512 + self.trust_remote_code = self.model_kwargs.get("trust_remote_code", None) From abec2b6a37ff0dc95ad2486ddf174e05c00e741e Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:50:17 +0100 Subject: [PATCH 08/30] test --- optimum_benchmark/backends/py_txi/backend.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 5afe1e1e..81f1f6c3 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List import torch -from accelerate import init_empty_weights +from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig from safetensors.torch import save_file @@ -36,26 +36,13 @@ def load(self) -> None: self.tmpdir.cleanup() - @property - def volume(self) -> str: - return list(self.config.volumes.keys())[0] - def download_pretrained_model(self) -> None: - # directly downloads pretrained model in volume (/data) to change generation config before loading model - with init_empty_weights(include_buffers=True): - self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs, cache_dir=self.volume) + model_snapshot_folder = snapshot_download(self.config.model, self.config.model_kwargs) if self.config.task in TEXT_GENERATION_TASKS: - self.logger.info("\t+ Preparing generation config") self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = 
f"{self.volume}/{model_cache_folder}" - snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" - snapshot_ref = open(snapshot_file, "r").read().strip() - model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving pretrained generation config") - self.generation_config.save_pretrained(save_directory=model_snapshot_path) + self.generation_config.save_pretrained(save_directory=model_snapshot_folder) def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") From 57680a3aae06595dd0b67ee315aed55b58080f36 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:51:00 +0100 Subject: [PATCH 09/30] test no weights only --- tests/configs/_no_weights_.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/configs/_no_weights_.yaml b/tests/configs/_no_weights_.yaml index 31bbf2eb..bb0afa43 100644 --- a/tests/configs/_no_weights_.yaml +++ b/tests/configs/_no_weights_.yaml @@ -2,4 +2,4 @@ hydra: mode: MULTIRUN sweeper: params: - backend.no_weights: true,false + backend.no_weights: true From 31f96ba7d9d99dbe36a2dbe0f780ce607ae7fd3a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 11:56:40 +0100 Subject: [PATCH 10/30] fix --- optimum_benchmark/backends/py_txi/backend.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 81f1f6c3..e4c00f27 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -46,9 +46,14 @@ def download_pretrained_model(self) -> None: def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - filename = os.path.join(self.no_weights_model, "model.safetensors") os.makedirs(self.no_weights_model, exist_ok=True) + if self.pretrained_config is not None: + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + if self.pretrained_processor is not None: + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + + filename = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model @@ -59,11 +64,6 @@ def create_no_weights_model(self) -> None: del self.pretrained_model torch.cuda.empty_cache() - if self.pretrained_config is not None: - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - if self.pretrained_processor is not None: - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - if self.config.task in TEXT_GENERATION_TASKS: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None @@ -78,11 +78,11 @@ def load_model_with_no_weights(self) -> None: def load_model_from_pretrained(self) -> None: if self.config.task in TEXT_GENERATION_TASKS: self.pretrained_model = TGI( - config=TGIConfig(self.config.model, **self.txi_kwargs, **self.tgi_kwargs), + config=TGIConfig(model_id=self.config.model, **self.txi_kwargs, **self.tgi_kwargs), ) elif self.config.task in TEXT_EMBEDDING_TASKS: self.pretrained_model = TEI( - config=TEIConfig(self.config.model, **self.txi_kwargs, **self.tei_kwargs), + 
config=TEIConfig(model_id=self.config.model, **self.txi_kwargs, **self.tei_kwargs), ) else: raise NotImplementedError(f"TXI does not support task {self.config.task}") From 8c8d073670a817f46b20590fd0ec24b63cf1fbd8 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 12:02:57 +0100 Subject: [PATCH 11/30] test --- optimum_benchmark/backends/py_txi/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index e4c00f27..ba542a2f 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -5,7 +5,7 @@ import torch from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file +from safetensors.torch import save_file, save_model from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -60,7 +60,7 @@ def create_no_weights_model(self) -> None: self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - save_file(tensors=self.pretrained_model.state_dict(), filename=filename, metadata={"format": "pt"}) + self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) del self.pretrained_model torch.cuda.empty_cache() From bc1c638f2251689a88a1d3f6077959db73f5ebd5 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 12:32:35 +0100 Subject: [PATCH 12/30] test --- optimum_benchmark/backends/py_txi/backend.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index ba542a2f..96a80b6c 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -48,13 +48,10 @@ def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") os.makedirs(self.no_weights_model, exist_ok=True) - if self.pretrained_config is not None: - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - if self.pretrained_processor is not None: - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - filename = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( @@ -70,8 +67,8 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = {self.tmpdir.name: {"bind": self.tmpdir.name, "mode": "rw"}} - original_model, self.config.model = self.config.model, self.no_weights_model + self.config.volumes = {self.tmpdir.name: {"bind": "/var/no_weights_folder", "mode": "rw"}} + original_model, self.config.model = self.config.model, "/var/no_weights_folder/no_weights_model" self.load_model_from_pretrained() self.config.model = original_model From 95d45e590f39bdf14031743e98770302d01ed570 Mon 
Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 12:52:04 +0100 Subject: [PATCH 13/30] test again --- optimum_benchmark/backends/py_txi/backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 96a80b6c..a3341bcf 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -67,8 +67,8 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = {self.tmpdir.name: {"bind": "/var/no_weights_folder", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/var/no_weights_folder/no_weights_model" + self.config.volumes = {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} + original_model, self.config.model = self.config.model, "/data/no_weights_model/" self.load_model_from_pretrained() self.config.model = original_model From 63ef40757baee4c5c82b48a6465fc1b76e814a92 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 13:13:17 +0100 Subject: [PATCH 14/30] test --- optimum_benchmark/backends/py_txi/backend.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index a3341bcf..a4c085eb 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -4,8 +4,9 @@ import torch from huggingface_hub import snapshot_download +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file, save_model +from safetensors.torch import save_file from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -67,8 +68,11 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = {self.tmpdir.name: {"bind": "/data", "mode": "rw"}} - original_model, self.config.model = self.config.model, "/data/no_weights_model/" + self.config.volumes = { + HUGGINGFACE_HUB_CACHE: {"bind": "/data", "mode": "rw"}, + self.tmpdir.name: {"bind": "/no_weights_folder", "mode": "rw"}, + } + original_model, self.config.model = self.config.model, "/no_weights_folder/no_weights_model/" self.load_model_from_pretrained() self.config.model = original_model From 1f9fdd661f2dc9aa2ab258c35151fa13d6c992eb Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 14:04:12 +0100 Subject: [PATCH 15/30] test --- optimum_benchmark/backends/py_txi/backend.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index a4c085eb..7ee11b7e 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -38,7 +38,7 @@ def load(self) -> None: self.tmpdir.cleanup() def download_pretrained_model(self) -> None: - model_snapshot_folder = snapshot_download(self.config.model, self.config.model_kwargs) + model_snapshot_folder = snapshot_download(self.config.model, **self.config.model_kwargs) if self.config.task in TEXT_GENERATION_TASKS: self.generation_config.eos_token_id = None @@ -68,11 +68,8 @@ def 
create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) def load_model_with_no_weights(self) -> None: - self.config.volumes = { - HUGGINGFACE_HUB_CACHE: {"bind": "/data", "mode": "rw"}, - self.tmpdir.name: {"bind": "/no_weights_folder", "mode": "rw"}, - } - original_model, self.config.model = self.config.model, "/no_weights_folder/no_weights_model/" + self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}} + original_model, self.config.model = self.config.model, "/no_weights_model/" self.load_model_from_pretrained() self.config.model = original_model From 2c8808023f6cd52922e120c1bf9abf5820c00460 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 14:13:19 +0100 Subject: [PATCH 16/30] disable safe ser --- optimum_benchmark/backends/py_txi/backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 7ee11b7e..83dcd6cb 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -4,7 +4,6 @@ import torch from huggingface_hub import snapshot_download -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from py_txi import TEI, TGI, TEIConfig, TGIConfig from safetensors.torch import save_file @@ -58,7 +57,7 @@ def create_no_weights_model(self) -> None: self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_model.save_pretrained(save_directory=self.no_weights_model, safe_serialization=False) del self.pretrained_model torch.cuda.empty_cache() From 4dd46d1228ca3b66a7c99af4f1df9194c0143d8e Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 14:31:54 +0100 Subject: [PATCH 17/30] test --- optimum_benchmark/backends/py_txi/backend.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 83dcd6cb..12d9d356 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -5,7 +5,7 @@ import torch from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file +from safetensors.torch import save_file, save_model from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -48,24 +48,24 @@ def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") os.makedirs(self.no_weights_model, exist_ok=True) + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + if self.config.task in TEXT_GENERATION_TASKS: + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + self.generation_config.save_pretrained(save_directory=self.no_weights_model) + filename = os.path.join(self.no_weights_model, "model.safetensors") save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) - self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - 
self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - self.pretrained_model.save_pretrained(save_directory=self.no_weights_model, safe_serialization=False) + save_model(tensors=self.pretrained_model, filename=filename, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() - if self.config.task in TEXT_GENERATION_TASKS: - self.generation_config.eos_token_id = None - self.generation_config.pad_token_id = None - self.generation_config.save_pretrained(save_directory=self.no_weights_model) - def load_model_with_no_weights(self) -> None: self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}} original_model, self.config.model = self.config.model, "/no_weights_model/" From 75945ac87e466cbe80bdc8e76346cc83d1aba596 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 15:10:59 +0100 Subject: [PATCH 18/30] fix --- optimum_benchmark/backends/py_txi/backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 12d9d356..605abdf2 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -5,7 +5,7 @@ import torch from huggingface_hub import snapshot_download from py_txi import TEI, TGI, TEIConfig, TGIConfig -from safetensors.torch import save_file, save_model +from safetensors.torch import save_model from ...task_utils import TEXT_EMBEDDING_TASKS, TEXT_GENERATION_TASKS from ..base import Backend @@ -56,13 +56,13 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) filename = os.path.join(self.no_weights_model, "model.safetensors") - save_file(tensors=torch.nn.Linear(1, 1).state_dict(), filename=filename, metadata={"format": "pt"}) + save_model(tensors=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False ) - save_model(tensors=self.pretrained_model, filename=filename, metadata={"format": "pt"}) + save_model(model=self.pretrained_model, filename=filename, metadata={"format": "pt"}) del self.pretrained_model torch.cuda.empty_cache() From bd542899cedcea9facbc3974b7bf29280cede6c3 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 15:13:06 +0100 Subject: [PATCH 19/30] faster req installation --- .github/workflows/test_cli_cpu_py_txi.yaml | 9 ++++++--- .github/workflows/test_cli_cuda_py_txi.yaml | 8 ++++++-- optimum_benchmark/backends/py_txi/backend.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test_cli_cpu_py_txi.yaml b/.github/workflows/test_cli_cpu_py_txi.yaml index 7b1946e7..06bd841d 100644 --- a/.github/workflows/test_cli_cpu_py_txi.yaml +++ b/.github/workflows/test_cli_cpu_py_txi.yaml @@ -43,9 +43,12 @@ jobs: - name: Install requirements run: | - pip install --upgrade pip - pip install torch torchvision torchaudio --index-url 
https://download.pytorch.org/whl/cpu - pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + pip install uv + uv pip install --upgrade pip + uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + uv pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + env: + UV_SYSTEM_PYTHON: 1 - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and py_txi" diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml index b8c50db0..a7fe9a51 100644 --- a/.github/workflows/test_cli_cuda_py_txi.yaml +++ b/.github/workflows/test_cli_cuda_py_txi.yaml @@ -44,8 +44,12 @@ jobs: - name: Install requirements run: | - pip install --upgrade pip - pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + pip install uv + uv pip install --upgrade pip + uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + uv pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git + env: + UV_SYSTEM_PYTHON: 1 - name: Run tests run: | diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 605abdf2..0f52bc50 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -56,7 +56,7 @@ def create_no_weights_model(self) -> None: self.generation_config.save_pretrained(save_directory=self.no_weights_model) filename = os.path.join(self.no_weights_model, "model.safetensors") - save_model(tensors=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) + save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model self.pretrained_model = self.automodel_loader.from_pretrained( From 3b0138b30cb884475724932f27ab922b751dbee1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 15:23:29 +0100 Subject: [PATCH 20/30] use older tgi version --- examples/cuda_tgi_llama.yaml | 1 + tests/configs/_no_weights_.yaml | 2 +- tests/configs/cpu_inference_py_txi_gpt2.yaml | 1 + tests/configs/cuda_inference_py_txi_gpt2.yaml | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml index a32060b1..a3d33af5 100644 --- a/examples/cuda_tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -18,6 +18,7 @@ backend: cuda_graphs: 0 # remove for better perf but bigger memory footprint no_weights: true model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + image: ghcr.io/huggingface/text-generation-inference:2.4.1 scenario: input_shapes: diff --git a/tests/configs/_no_weights_.yaml b/tests/configs/_no_weights_.yaml index bb0afa43..31bbf2eb 100644 --- a/tests/configs/_no_weights_.yaml +++ b/tests/configs/_no_weights_.yaml @@ -2,4 +2,4 @@ hydra: mode: MULTIRUN sweeper: params: - backend.no_weights: true + backend.no_weights: true,false diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 82a522bd..23c18416 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -12,3 +12,4 @@ name: cpu_inference_py_txi_gpt2 backend: cuda_graphs: 0 + image: ghcr.io/huggingface/text-generation-inference:2.4.1 diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml 
b/tests/configs/cuda_inference_py_txi_gpt2.yaml index d0d17dbc..75e20094 100644 --- a/tests/configs/cuda_inference_py_txi_gpt2.yaml +++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml @@ -12,3 +12,4 @@ name: cuda_inference_py_txi_gpt2 backend: cuda_graphs: 0 + image: ghcr.io/huggingface/text-generation-inference:2.4.1 From a8c41596d75cafcda5f676bb38d1ce8ab150ac35 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 16 Dec 2024 17:18:26 +0100 Subject: [PATCH 21/30] disable no weights on tgi cuda for now --- examples/cuda_tgi_llama.yaml | 3 +-- optimum_benchmark/backends/py_txi/backend.py | 11 ++++++----- tests/configs/cpu_inference_py_txi_gpt2.yaml | 4 ---- tests/configs/cuda_inference_py_txi_gpt2.yaml | 5 ----- 4 files changed, 7 insertions(+), 16 deletions(-) diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml index a3d33af5..16d2f5f0 100644 --- a/examples/cuda_tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -16,9 +16,8 @@ backend: device: cuda device_ids: 0 cuda_graphs: 0 # remove for better perf but bigger memory footprint - no_weights: true + no_weights: false model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 - image: ghcr.io/huggingface/text-generation-inference:2.4.1 scenario: input_shapes: diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py index 0f52bc50..00e1044d 100644 --- a/optimum_benchmark/backends/py_txi/backend.py +++ b/optimum_benchmark/backends/py_txi/backend.py @@ -46,16 +46,12 @@ def download_pretrained_model(self) -> None: def create_no_weights_model(self) -> None: self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + filename = os.path.join(self.no_weights_model, "model.safetensors") os.makedirs(self.no_weights_model, exist_ok=True) self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) - if self.config.task in TEXT_GENERATION_TASKS: - self.generation_config.eos_token_id = None - self.generation_config.pad_token_id = None - self.generation_config.save_pretrained(save_directory=self.no_weights_model) - filename = os.path.join(self.no_weights_model, "model.safetensors") save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"}) with fast_weights_init(): # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model @@ -66,6 +62,11 @@ def create_no_weights_model(self) -> None: del self.pretrained_model torch.cuda.empty_cache() + if self.config.task in TEXT_GENERATION_TASKS: + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + self.generation_config.save_pretrained(save_directory=self.no_weights_model) + def load_model_with_no_weights(self) -> None: self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}} original_model, self.config.model = self.config.model, "/no_weights_model/" diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml index 23c18416..1aef598e 100644 --- a/tests/configs/cpu_inference_py_txi_gpt2.yaml +++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml @@ -9,7 +9,3 @@ defaults: - override backend: py-txi name: cpu_inference_py_txi_gpt2 - -backend: - cuda_graphs: 0 - image: ghcr.io/huggingface/text-generation-inference:2.4.1 diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml index 75e20094..73a5c10a 100644 --- 
From a8c41596d75cafcda5f676bb38d1ce8ab150ac35 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 17:18:26 +0100
Subject: [PATCH 21/30] disable no weights on tgi cuda for now

---
 examples/cuda_tgi_llama.yaml                  |  3 +--
 optimum_benchmark/backends/py_txi/backend.py  | 11 ++++++-----
 tests/configs/cpu_inference_py_txi_gpt2.yaml  |  4 ----
 tests/configs/cuda_inference_py_txi_gpt2.yaml |  5 -----
 4 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml
index a3d33af5..16d2f5f0 100644
--- a/examples/cuda_tgi_llama.yaml
+++ b/examples/cuda_tgi_llama.yaml
@@ -16,9 +16,8 @@ backend:
   device: cuda
   device_ids: 0
   cuda_graphs: 0 # remove for better perf but bigger memory footprint
-  no_weights: true
+  no_weights: false
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1

 scenario:
   input_shapes:
diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 0f52bc50..00e1044d 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -46,16 +46,12 @@ def download_pretrained_model(self) -> None:

     def create_no_weights_model(self) -> None:
         self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
+        filename = os.path.join(self.no_weights_model, "model.safetensors")
         os.makedirs(self.no_weights_model, exist_ok=True)

         self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
         self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model)

-        if self.config.task in TEXT_GENERATION_TASKS:
-            self.generation_config.eos_token_id = None
-            self.generation_config.pad_token_id = None
-            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
-
-        filename = os.path.join(self.no_weights_model, "model.safetensors")
         save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"})
         with fast_weights_init():
             # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
@@ -66,6 +62,11 @@ def create_no_weights_model(self) -> None:
         del self.pretrained_model
         torch.cuda.empty_cache()

+        if self.config.task in TEXT_GENERATION_TASKS:
+            self.generation_config.eos_token_id = None
+            self.generation_config.pad_token_id = None
+            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
+
     def load_model_with_no_weights(self) -> None:
         self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}}
         original_model, self.config.model = self.config.model, "/no_weights_model/"
diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml
index 23c18416..1aef598e 100644
--- a/tests/configs/cpu_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml
@@ -9,7 +9,3 @@ defaults:
   - override backend: py-txi

 name: cpu_inference_py_txi_gpt2
-
-backend:
-  cuda_graphs: 0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1
diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml
index 75e20094..73a5c10a 100644
--- a/tests/configs/cuda_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml
@@ -3,13 +3,8 @@ defaults:
   - _base_ # inherits from base config
   - _cuda_ # inherits from cuda config
   - _inference_ # inherits from inference config
-  - _no_weights_ # inherits from no weights config
   - _gpt2_ # inherits from gpt2 config
   - _self_ # hydra 1.1 compatibility
   - override backend: py-txi

 name: cuda_inference_py_txi_gpt2
-
-backend:
-  cuda_graphs: 0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1
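The reordering above writes the generation config only after the model has been materialized and freed. The underlying "no weights" trick is independent of TXI and can be sketched on its own: build a randomly initialized model purely from its config, then serialize it so the server has real tensors to load (the model id here is just an example):

```python
# Standalone sketch of the no-weights idea; gpt2 is an illustrative model id.
import torch
from safetensors.torch import save_model
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_config(config)  # random init, no weight download
save_model(model=model, filename="model.safetensors", metadata={"format": "pt"})
```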
From 9c1cd0ca8e2cafa96ac4d71bd0622a62eb0b5a75 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 19:10:34 +0100
Subject: [PATCH 22/30] test

---
 optimum_benchmark/backends/py_txi/backend.py  | 25 ++++++++-----------
 tests/configs/cuda_inference_py_txi_gpt2.yaml |  1 +
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 00e1044d..912ea86c 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,9 +1,10 @@
 import os
+from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List

 import torch
-from huggingface_hub import snapshot_download
+from huggingface_hub import hf_hub_download, snapshot_download
 from py_txi import TEI, TGI, TEIConfig, TGIConfig
 from safetensors.torch import save_model

@@ -45,33 +46,29 @@ def download_pretrained_model(self) -> None:
         self.generation_config.save_pretrained(save_directory=model_snapshot_folder)

     def create_no_weights_model(self) -> None:
-        self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
-        filename = os.path.join(self.no_weights_model, "model.safetensors")
-        os.makedirs(self.no_weights_model, exist_ok=True)
-
-        self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
-        self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model)
-
-        save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"})
+        model_path = Path(hf_hub_download(self.config.model, filename="config.json", cache_dir=self.tmpdir.name)).parent
+        save_model(model=torch.nn.Linear(1, 1), filename=model_path / "model.safetensors", metadata={"format": "pt"})
+        self.pretrained_processor.save_pretrained(save_directory=model_path)
+        self.pretrained_config.save_pretrained(save_directory=model_path)

         with fast_weights_init():
             # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
             self.pretrained_model = self.automodel_loader.from_pretrained(
-                self.no_weights_model, **self.config.model_kwargs, device_map="auto", _fast_init=False
+                model_path, **self.config.model_kwargs, device_map="auto", _fast_init=False
             )
-        save_model(model=self.pretrained_model, filename=filename, metadata={"format": "pt"})
+        save_model(model=self.pretrained_model, filename=model_path / "model.safetensors", metadata={"format": "pt"})
         del self.pretrained_model
         torch.cuda.empty_cache()

         if self.config.task in TEXT_GENERATION_TASKS:
             self.generation_config.eos_token_id = None
             self.generation_config.pad_token_id = None
-            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
+            self.generation_config.save_pretrained(save_directory=model_path)

     def load_model_with_no_weights(self) -> None:
-        self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}}
-        original_model, self.config.model = self.config.model, "/no_weights_model/"
+        original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}}
         self.load_model_from_pretrained()
-        self.config.model = original_model
+        self.config.volumes = original_volumes

     def load_model_from_pretrained(self) -> None:
         if self.config.task in TEXT_GENERATION_TASKS:
diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml
index 73a5c10a..1c93ac36 100644
--- a/tests/configs/cuda_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml
@@ -3,6 +3,7 @@ defaults:
   - _base_ # inherits from base config
   - _cuda_ # inherits from cuda config
   - _inference_ # inherits from inference config
+  - _no_weights_ # inherits from no weights config
   - _gpt2_ # inherits from gpt2 config
   - _self_ # hydra 1.1 compatibility
   - override backend: py-txi

From 95ee4b455d6b9a554d4f446abfd3435d5eeb2324 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 19:10:50 +0100
Subject: [PATCH 23/30] style

---
 optimum_benchmark/backends/py_txi/backend.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 912ea86c..e25e2def 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List
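The `hf_hub_download` call above does double duty: it fetches only `config.json`, and its return path lands inside a hub-style snapshot folder under `cache_dir`, so `.parent` is a directory that already has the layout the TGI cache expects. A small sketch of that property (cache path is illustrative):

```python
# Sketch: the parent of a hf_hub_download path is the snapshot directory.
from pathlib import Path
from huggingface_hub import hf_hub_download

config_file = hf_hub_download("gpt2", filename="config.json", cache_dir="/tmp/nw_cache")
snapshot_dir = Path(config_file).parent
print(snapshot_dir)  # .../models--gpt2/snapshots/<commit-sha>/
```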
From e17e220701eef4eb7f83265976187b99670b618f Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:23:25 +0100
Subject: [PATCH 24/30] test

---
 optimum_benchmark/backends/py_txi/backend.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index e25e2def..f357936b 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union

 import torch
 from huggingface_hub import hf_hub_download, snapshot_download
@@ -15,6 +15,7 @@ class PyTXIBackend(Backend[PyTXIConfig]):
     NAME: str = "py-txi"
+    pretrained_model: Union[TEI, TGI]

     def __init__(self, config: PyTXIConfig) -> None:
         super().__init__(config)
@@ -65,7 +66,10 @@ def create_no_weights_model(self) -> None:
             self.generation_config.save_pretrained(save_directory=model_path)

     def load_model_with_no_weights(self) -> None:
-        original_volumes, self.config.volumes = self.config.volumes, {self.tmpdir.name: {"bind": "/data", "mode": "rw"}}
+        original_volumes, self.config.volumes = (
+            self.config.volumes,
+            {Path(self.tmpdir.name) / "hub": {"bind": "/data", "mode": "rw"}},
+        )
         self.load_model_from_pretrained()
         self.config.volumes = original_volumes

From 307df8112f0e03ce5269e8f05ae8edd6fd215c96 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:28:04 +0100
Subject: [PATCH 25/30] test

---
 optimum_benchmark/backends/py_txi/backend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index f357936b..21237136 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,3 +1,4 @@
+import os
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Union
@@ -68,7 +69,7 @@ def create_no_weights_model(self) -> None:

     def load_model_with_no_weights(self) -> None:
         original_volumes, self.config.volumes = (
             self.config.volumes,
-            {Path(self.tmpdir.name) / "hub": {"bind": "/data", "mode": "rw"}},
+            {self.tmpdir.name: {"bind": "/data/hub/", "mode": "rw"}},
         )
         self.load_model_from_pretrained()
         self.config.volumes = original_volumes
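These two commits iterate on where the temporary directory should be mounted; the dict being tweaked is Docker's volume-mapping format, which py-txi forwards to the Docker SDK. A hedged sketch of what that mapping means once it reaches `docker-py` (image tag and host path are illustrative):

```python
# Sketch of the {host_path: {"bind": ..., "mode": ...}} volume format in docker-py.
import docker

client = docker.from_env()
container = client.containers.run(
    "ghcr.io/huggingface/text-generation-inference:2.4.1",
    volumes={"/tmp/nw_cache": {"bind": "/data", "mode": "rw"}},  # host dir -> /data in container
    detach=True,
)
container.stop()
```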
From 7336fce60583d806f9a3d7a43c644d594e740e82 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:38:03 +0100
Subject: [PATCH 26/30] catch errors

---
 optimum_benchmark/backends/py_txi/backend.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 21237136..e7e298f4 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -1,4 +1,4 @@
-import os
+import shutil
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from typing import Any, Dict, List, Union
@@ -36,7 +36,10 @@ def load(self) -> None:
             self.logger.info("\t+ Loading pretrained model")
             self.load_model_from_pretrained()

-        self.tmpdir.cleanup()
+        try:
+            self.tmpdir.cleanup()
+        except Exception:
+            shutil.rmtree(self.tmpdir.name)

     def download_pretrained_model(self) -> None:
         model_snapshot_folder = snapshot_download(self.config.model, **self.config.model_kwargs)
@@ -49,6 +52,7 @@ def download_pretrained_model(self) -> None:
     def create_no_weights_model(self) -> None:
         model_path = Path(hf_hub_download(self.config.model, filename="config.json", cache_dir=self.tmpdir.name)).parent
         save_model(model=torch.nn.Linear(1, 1), filename=model_path / "model.safetensors", metadata={"format": "pt"})
+
         self.pretrained_processor.save_pretrained(save_directory=model_path)
         self.pretrained_config.save_pretrained(save_directory=model_path)

@@ -57,6 +61,7 @@ def create_no_weights_model(self) -> None:
             self.pretrained_model = self.automodel_loader.from_pretrained(
                 model_path, **self.config.model_kwargs, device_map="auto", _fast_init=False
             )
+
         save_model(model=self.pretrained_model, filename=model_path / "model.safetensors", metadata={"format": "pt"})
         del self.pretrained_model
         torch.cuda.empty_cache()

From 942f0e07acdd70b2838284d83fb75c838ab64c46 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 20:42:20 +0100
Subject: [PATCH 27/30] ignore errors

---
 optimum_benchmark/backends/py_txi/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index e7e298f4..3acd2e42 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -39,7 +39,7 @@ def load(self) -> None:
         try:
             self.tmpdir.cleanup()
         except Exception:
-            shutil.rmtree(self.tmpdir.name)
+            shutil.rmtree(self.tmpdir.name, ignore_errors=True)
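`TemporaryDirectory.cleanup()` can raise (a `PermissionError`, for example) when the container wrote root-owned files into the bind-mounted directory; the fallback retries with `ignore_errors=True`. The pattern in isolation:

```python
# Standalone version of the cleanup fallback introduced above.
import shutil
from tempfile import TemporaryDirectory

tmpdir = TemporaryDirectory()
try:
    tmpdir.cleanup()
except Exception:
    # e.g. root-owned files left behind by a container; best-effort removal
    shutil.rmtree(tmpdir.name, ignore_errors=True)
```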
From 9d5cf6cca0449996de328e7b2354014b53bec4a2 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Mon, 16 Dec 2024 21:12:06 +0100
Subject: [PATCH 28/30] test

---
 optimum_benchmark/backends/py_txi/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 3acd2e42..014af25f 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -74,7 +74,7 @@ def create_no_weights_model(self) -> None:
     def load_model_with_no_weights(self) -> None:
         original_volumes, self.config.volumes = (
             self.config.volumes,
-            {self.tmpdir.name: {"bind": "/data/hub/", "mode": "rw"}},
+            {self.tmpdir.name: {"bind": "/data", "mode": "rw"}},
         )
         self.load_model_from_pretrained()
         self.config.volumes = original_volumes

From 2e76c97470bebb9c95b330b7465c6e87b98cdc4a Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Tue, 17 Dec 2024 09:37:47 +0100
Subject: [PATCH 29/30] test

---
 examples/cuda_tgi_llama.yaml                 |  2 +-
 optimum_benchmark/backends/py_txi/backend.py | 11 +++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml
index 16d2f5f0..a32060b1 100644
--- a/examples/cuda_tgi_llama.yaml
+++ b/examples/cuda_tgi_llama.yaml
@@ -16,7 +16,7 @@ backend:
   device: cuda
   device_ids: 0
   cuda_graphs: 0 # remove for better perf but bigger memory footprint
-  no_weights: false
+  no_weights: true
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

 scenario:
diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 014af25f..55aecab9 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -59,7 +59,10 @@ def create_no_weights_model(self) -> None:
         with fast_weights_init():
             # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
             self.pretrained_model = self.automodel_loader.from_pretrained(
-                model_path, **self.config.model_kwargs, device_map="auto", _fast_init=False
+                model_path,
+                _fast_init=False,
+                device_map="auto",
+                **self.config.model_kwargs,
             )

         save_model(model=self.pretrained_model, filename=model_path / "model.safetensors", metadata={"format": "pt"})
@@ -72,12 +75,8 @@ def create_no_weights_model(self) -> None:
             self.generation_config.save_pretrained(save_directory=model_path)

     def load_model_with_no_weights(self) -> None:
-        original_volumes, self.config.volumes = (
-            self.config.volumes,
-            {self.tmpdir.name: {"bind": "/data", "mode": "rw"}},
-        )
+        self.config.volumes = {self.tmpdir.name: {"bind": "/data", "mode": "rw"}}
         self.load_model_from_pretrained()
-        self.config.volumes = original_volumes

     def load_model_from_pretrained(self) -> None:
         if self.config.task in TEXT_GENERATION_TASKS:
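After this commit the volume swap is no longer restored, so the method reduces to a single assignment plus the normal load path. A reconstruction of its final shape, reduced to a standalone function over a config-like object for illustration:

```python
# Pieced together from the hunks above; SimpleNamespace stands in for the config.
from types import SimpleNamespace

def set_no_weights_volumes(config: SimpleNamespace, tmpdir_name: str) -> None:
    # bind the temp HF cache into the container's /data cache directory
    config.volumes = {tmpdir_name: {"bind": "/data", "mode": "rw"}}

config = SimpleNamespace(volumes={})
set_no_weights_volumes(config, "/tmp/nw_cache")
print(config.volumes)
```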
From e29697953f19c698f36b329569392c91c5783cc6 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil
Date: Tue, 17 Dec 2024 10:26:50 +0100
Subject: [PATCH 30/30] update readme

---
 README.md                       | 18 ++++++------------
 examples/cuda_pytorch_bert.yaml |  2 +-
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 6358b341..9203b778 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,6 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices

 [![CLI_CPU_IPEX](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_ipex.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_ipex.yaml)
 [![CLI_CPU_LLAMA_CPP](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml)
-[![CLI_CPU_NEURAL_COMPRESSOR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
 [![CLI_CPU_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
 [![CLI_CPU_OPENVINO](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
 [![CLI_CPU_PYTORCH](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml)
@@ -61,7 +60,6 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices
 [![CLI_CUDA_TENSORRT_LLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml)
 [![CLI_CUDA_TORCH_ORT](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml)
 [![CLI_CUDA_VLLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml)
-[![CLI_ENERGY_STAR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_energy_star.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_energy_star.yaml)
 [![CLI_MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml)
 [![CLI_ROCM_PYTORCH](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml)
@@ -100,10 +98,9 @@ Depending on the backends you want to use, you can install `optimum-benchmark` with the following extras:

 - OnnxRuntime: `pip install optimum-benchmark[onnxruntime]`
 - TensorRT-LLM: `pip install optimum-benchmark[tensorrt-llm]`
 - OnnxRuntime-GPU: `pip install optimum-benchmark[onnxruntime-gpu]`
-- Neural Compressor: `pip install optimum-benchmark[neural-compressor]`
-- Py-TXI: `pip install optimum-benchmark[py-txi]`
-- IPEX: `pip install optimum-benchmark[ipex]`
+- Py-TXI (TGI & TEI): `pip install optimum-benchmark[py-txi]`
 - vLLM: `pip install optimum-benchmark[vllm]`
+- IPEX: `pip install optimum-benchmark[ipex]`

 We also support the following extra dependencies:
@@ -144,9 +141,6 @@ if __name__ == "__main__":
     )
     benchmark_report = Benchmark.launch(benchmark_config)

-    # log the benchmark in terminal
-    benchmark_report.log() # or print(benchmark_report)
-
     # convert artifacts to a dictionary or dataframe
     benchmark_config.to_dict() # or benchmark_config.to_dataframe()

@@ -175,15 +169,17 @@ If you're on VSCode, you can hover over the configuration classes to see the available parameters.

 You can also run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension.

 ```bash
-optimum-benchmark --config-dir examples/ --config-name pytorch_bert
+optimum-benchmark --config-dir examples/ --config-name cuda_pytorch_bert
 ```

-This will run the benchmark using the configuration in [`examples/pytorch_bert.yaml`](examples/pytorch_bert.yaml) and store the results in `runs/pytorch_bert`.
+This will run the benchmark using the configuration in [`examples/cuda_pytorch_bert.yaml`](examples/cuda_pytorch_bert.yaml) and store the results in `runs/cuda_pytorch_bert`.

 The resulting files are:

 - `benchmark_config.json` which contains the configuration used for the benchmark, including the backend, launcher, scenario and the environment in which the benchmark was run.
 - `benchmark_report.json` which contains a full report of the benchmark's results, like latency measurements, memory usage, energy consumption, etc.
+- `benchmark_report.txt` which contains a detailed report of the benchmark's results, in the same format as they were logged.
+- `benchmark_report.md` which contains a detailed report of the benchmark's results, in markdown format.
 - `benchmark.json` contains both the report and the configuration in a single file.
 - `benchmark.log` contains the logs of the benchmark run.
@@ -309,9 +305,7 @@ For more information on the features of each backend, you can check their respective configuration pages:

 - [PyTorchConfig](optimum_benchmark/backends/pytorch/config.py)
 - [ORTConfig](optimum_benchmark/backends/onnxruntime/config.py)
 - [TorchORTConfig](optimum_benchmark/backends/torch_ort/config.py)
-- [LLMSwarmConfig](optimum_benchmark/backends/llm_swarm/config.py)
 - [TRTLLMConfig](optimum_benchmark/backends/tensorrt_llm/config.py)
-- [INCConfig](optimum_benchmark/backends/neural_compressor/config.py)
diff --git a/examples/cuda_pytorch_bert.yaml b/examples/cuda_pytorch_bert.yaml
index 8ab9b5cb..195e8a02 100644
--- a/examples/cuda_pytorch_bert.yaml
+++ b/examples/cuda_pytorch_bert.yaml
@@ -6,7 +6,7 @@ defaults:
   - _base_
   - _self_

-name: pytorch_bert
+name: cuda_pytorch_bert

 launcher:
   device_isolation: true
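The README section above names the artifacts each run produces; a short sketch of inspecting them after a run (the path assumes the `runs/cuda_pytorch_bert` output directory the README mentions):

```python
# Sketch: the JSON artifacts are plain files and can be read without the library.
import json

with open("runs/cuda_pytorch_bert/benchmark_report.json") as f:
    report = json.load(f)

# latency, memory and energy sections live alongside each other in the report
print(sorted(report.keys()))
```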