From 2c8655a1db51262d00c3ecc0a4ee6bf6a797b6be Mon Sep 17 00:00:00 2001 From: Luis Montero Date: Wed, 2 Aug 2023 11:04:03 +0200 Subject: [PATCH] chore: implement hybrid model demo with GPT-2 --- .../ml/deployment/fhe_client_server.py | 25 +- src/concrete/ml/onnx/convert.py | 61 +++- src/concrete/ml/torch/compile.py | 17 +- src/concrete/ml/torch/hybrid_model.py | 288 +++++++++++++++--- tests/torch/test_hybrid_converter.py | 3 +- use_case_examples/hybrid_model/README.md | 14 + use_case_examples/hybrid_model/compile.sh | 6 + .../hybrid_model/compile_hybrid_llm.py | 140 +++++++++ .../hybrid_model/infer_hybrid_llm_generate.py | 89 ++++++ .../hybrid_model/load_and_analyze_data.py | 32 ++ .../hybrid_model/requirements.txt | 3 + use_case_examples/hybrid_model/serve.sh | 13 + use_case_examples/hybrid_model/serve_model.py | 193 ++++++++++++ use_case_examples/llm/QGPT2Evaluate.ipynb | 2 +- 14 files changed, 809 insertions(+), 77 deletions(-) create mode 100644 use_case_examples/hybrid_model/README.md create mode 100644 use_case_examples/hybrid_model/compile.sh create mode 100644 use_case_examples/hybrid_model/compile_hybrid_llm.py create mode 100644 use_case_examples/hybrid_model/infer_hybrid_llm_generate.py create mode 100644 use_case_examples/hybrid_model/load_and_analyze_data.py create mode 100644 use_case_examples/hybrid_model/requirements.txt create mode 100644 use_case_examples/hybrid_model/serve.sh create mode 100644 use_case_examples/hybrid_model/serve_model.py diff --git a/src/concrete/ml/deployment/fhe_client_server.py b/src/concrete/ml/deployment/fhe_client_server.py index 8feb481dd3..27a873bf79 100644 --- a/src/concrete/ml/deployment/fhe_client_server.py +++ b/src/concrete/ml/deployment/fhe_client_server.py @@ -18,11 +18,11 @@ try: # 3.8 and above # pylint: disable-next=no-name-in-module - from importlib.metadata import version + from importlib.metadata import PackageNotFoundError, version except ImportError: # pragma: no cover # 3.7 and below # pylint: disable-next=no-name-in-module - from importlib_metadata import version + from importlib_metadata import PackageNotFoundError, version class FHEModelServer: @@ -54,11 +54,14 @@ def load(self): versions = json.load(file) errors = [] - packages_to_check = {"concrete-python"} + packages_to_check = {"concrete-python", "concrete-ml"} for package_name, package_version in versions.items(): if package_name not in packages_to_check: continue - current_version = version(package_name) + if package_name == "concrete-ml": + current_version = CML_VERSION + else: + current_version = version(package_name) if package_version != current_version: # pragma: no cover errors.append((package_name, package_version, current_version)) if errors: # pragma: no cover @@ -190,13 +193,10 @@ def save(self, via_mlir: bool = False): # Add versions versions_path = Path(self.path_dir).joinpath("versions.json") versions = { - package_name: version(package_name) - for package_name in ["concrete-ml", "concrete-python"] + "concrete-python": version("concrete-python"), + "concrete-ml": CML_VERSION, + "python": f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}", } - versions[ - "python" - ] = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" - with open(versions_path, "w", encoding="utf-8") as file: json.dump(fp=file, obj=versions) @@ -255,7 +255,10 @@ def load(self): # pylint: disable=no-value-for-parameter for package_name, package_version in versions.items(): if package_name not in packages_to_check: continue - 
current_version = version(package_name) + if package_name == "concrete-ml": + current_version = CML_VERSION + else: + current_version = version(package_name) if package_version != current_version: # pragma: no cover errors.append((package_name, package_version, current_version)) if errors: # pragma: no cover diff --git a/src/concrete/ml/onnx/convert.py b/src/concrete/ml/onnx/convert.py index d1e2fcd0e8..b10cf74d70 100644 --- a/src/concrete/ml/onnx/convert.py +++ b/src/concrete/ml/onnx/convert.py @@ -2,10 +2,11 @@ import tempfile from pathlib import Path -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Tuple, Union import numpy import onnx +import onnxoptimizer import torch from onnx import checker @@ -17,8 +18,8 @@ def get_equivalent_numpy_forward_and_onnx_model( torch_module: torch.nn.Module, dummy_input: Union[torch.Tensor, Tuple[torch.Tensor, ...]], - output_onnx_file: Optional[Union[Path, str]] = None, -) -> Tuple[Callable[..., Tuple[numpy.ndarray, ...]], onnx.GraphProto]: + output_onnx_file: Union[None, Path, str] = None, +) -> Tuple[Callable[..., Tuple[numpy.ndarray, ...]], onnx.ModelProto]: """Get the numpy equivalent forward of the provided torch Module. Args: @@ -34,7 +35,6 @@ def get_equivalent_numpy_forward_and_onnx_model( execute the equivalent numpy code to the passed torch_module and the generated ONNX model. """ - output_onnx_file_path = Path( tempfile.mkstemp(suffix=".onnx")[1] if output_onnx_file is None else output_onnx_file ) @@ -47,7 +47,58 @@ def get_equivalent_numpy_forward_and_onnx_model( opset_version=OPSET_VERSION_FOR_ONNX_EXPORT, ) equivalent_onnx_model = onnx.load_model(str(output_onnx_file_path)) - + # List of all currently supported onnx passes + # onnx_passes = [ + # 'adjust_add', + # 'rename_input_output', + # 'set_unique_name_for_nodes', + # 'nop', + # 'eliminate_nop_cast', + # 'eliminate_nop_dropout', + # 'eliminate_nop_flatten', + # 'extract_constant_to_initializer', + # 'eliminate_if_with_const_cond', + # 'eliminate_nop_monotone_argmax', + # 'eliminate_nop_pad', + # 'eliminate_nop_concat', + # 'eliminate_nop_split', + # 'eliminate_nop_expand', + # 'eliminate_shape_gather', + # 'eliminate_slice_after_shape', + # 'eliminate_nop_transpose', + # 'fuse_add_bias_into_conv', + # 'fuse_bn_into_conv', + # 'fuse_consecutive_concats', + # 'fuse_consecutive_log_softmax', + # 'fuse_consecutive_reduce_unsqueeze', + # 'fuse_consecutive_squeezes', + # 'fuse_consecutive_transposes', + # 'fuse_matmul_add_bias_into_gemm', + # 'fuse_pad_into_conv', + # 'fuse_pad_into_pool', + # 'fuse_transpose_into_gemm', + # 'replace_einsum_with_matmul', + # 'lift_lexical_references', + # 'split_init', + # 'split_predict', + # 'fuse_concat_into_reshape', + # 'eliminate_nop_reshape', + # 'eliminate_nop_with_unit', + # 'eliminate_common_subexpression', + # 'fuse_qkv', + # 'fuse_consecutive_unsqueezes', + # 'eliminate_deadend', + # 'eliminate_identity', + # 'eliminate_shape_op', + # 'fuse_consecutive_slices', + # 'eliminate_unused_initializer', + # 'eliminate_duplicate_initializer', + # 'adjust_slice_and_matmul' + # ] + onnx_passes = ["fuse_matmul_add_bias_into_gemm"] + equivalent_onnx_model = onnxoptimizer.optimize(equivalent_onnx_model, onnx_passes) + with output_onnx_file_path.open("wb") as file: + file.write(equivalent_onnx_model.SerializeToString()) checker.check_model(equivalent_onnx_model) # Remove the tempfile if we used one diff --git a/src/concrete/ml/torch/compile.py b/src/concrete/ml/torch/compile.py index 0457d8d4ba..ba3c1ecedd 100644 --- 
a/src/concrete/ml/torch/compile.py +++ b/src/concrete/ml/torch/compile.py @@ -3,7 +3,7 @@ import tempfile import warnings from pathlib import Path -from typing import Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union import numpy import onnx @@ -55,7 +55,7 @@ def build_quantized_module( model: Union[torch.nn.Module, onnx.ModelProto], torch_inputset: Dataset, import_qat: bool = False, - n_bits=MAX_BITWIDTH_BACKWARD_COMPATIBLE, + n_bits: Union[int, Dict[str, int]] = MAX_BITWIDTH_BACKWARD_COMPATIBLE, rounding_threshold_bits: Optional[int] = None, ) -> QuantizedModule: """Build a quantized module from a Torch or ONNX model. @@ -81,12 +81,9 @@ def build_quantized_module( convert_torch_tensor_or_numpy_array_to_numpy_array(val) for val in to_tuple(torch_inputset) ) - # Tracing needs to be done with the batch size of 1 since we compile our models to FHE with - # this batch size. The input set contains many examples, to determine a representative - # bit-width, but for tracing we only take a single one. We need the ONNX tracing batch size to - # match the batch size during FHE inference which can only be 1 for the moment. + # No batch dimension (i.e. 0 instead of [0]) because else GEMM onnx pass is not applied dummy_input_for_tracing = tuple( - torch.from_numpy(val[[0], ::]).float() for val in inputset_as_numpy_tuple + torch.from_numpy(val[0, ::]).float() for val in inputset_as_numpy_tuple ) # Create corresponding numpy model @@ -113,7 +110,7 @@ def _compile_torch_or_onnx_model( configuration: Optional[Configuration] = None, artifacts: Optional[DebugArtifacts] = None, show_mlir: bool = False, - n_bits=MAX_BITWIDTH_BACKWARD_COMPATIBLE, + n_bits: Union[int, Dict[str, int]] = MAX_BITWIDTH_BACKWARD_COMPATIBLE, rounding_threshold_bits: Optional[int] = None, p_error: Optional[float] = None, global_p_error: Optional[float] = None, @@ -272,7 +269,7 @@ def compile_onnx_model( configuration: Optional[Configuration] = None, artifacts: Optional[DebugArtifacts] = None, show_mlir: bool = False, - n_bits=MAX_BITWIDTH_BACKWARD_COMPATIBLE, + n_bits: Union[int, Dict] = MAX_BITWIDTH_BACKWARD_COMPATIBLE, rounding_threshold_bits: Optional[int] = None, p_error: Optional[float] = None, global_p_error: Optional[float] = None, @@ -340,7 +337,7 @@ def compile_brevitas_qat_model( rounding_threshold_bits: Optional[int] = None, p_error: Optional[float] = None, global_p_error: Optional[float] = None, - output_onnx_file: Union[Path, str] = None, + output_onnx_file: Union[None, Path, str] = None, verbose: bool = False, ) -> QuantizedModule: """Compile a Brevitas Quantization Aware Training model. 
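Illustrative sketch (not part of the patch): the convert.py change above runs onnxoptimizer with the single "fuse_matmul_add_bias_into_gemm" pass, and the compile.py change traces with `val[0, ::]` (no batch dimension) because the fusion only applies to 2-D matrix products. The toy module, shapes, and opset version below are assumptions chosen for the example only.

# Sketch: show the MatMul + Add pair exported by torch being fused into one Gemm node,
# which is what the hybrid-model compilation path expects for linear layers.
import tempfile

import onnx
import onnxoptimizer
import torch


class TinyLinear(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.nn.Parameter(torch.randn(8, 4))
        self.b = torch.nn.Parameter(torch.randn(4))

    def forward(self, x):
        # Exported as MatMul followed by Add (not Gemm) for this functional form
        return x @ self.w + self.b


path = tempfile.mkstemp(suffix=".onnx")[1]
# 2-D dummy input: the Gemm fusion does not trigger on higher-rank MatMuls,
# hence the batch-dimension removal in build_quantized_module
torch.onnx.export(TinyLinear(), torch.randn(1, 8), path, opset_version=14)
model = onnx.load_model(path)
print([node.op_type for node in model.graph.node])      # e.g. ['MatMul', 'Add']
optimized = onnxoptimizer.optimize(model, ["fuse_matmul_add_bias_into_gemm"])
print([node.op_type for node in optimized.graph.node])  # e.g. ['Gemm']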
diff --git a/src/concrete/ml/torch/hybrid_model.py b/src/concrete/ml/torch/hybrid_model.py index d6c473ca55..f59185e5b1 100644 --- a/src/concrete/ml/torch/hybrid_model.py +++ b/src/concrete/ml/torch/hybrid_model.py @@ -1,16 +1,42 @@ """Implement the conversion of a torch model to a hybrid fhe/torch inference.""" +import ast +import enum +import io +import sys +import time import uuid from pathlib import Path -from typing import List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union +import numpy +import requests import torch from concrete.fhe import Configuration from torch import nn from transformers import Conv1D from ..deployment.fhe_client_server import FHEModelClient, FHEModelDev -from .compile import compile_torch_model +from .compile import QuantizedModule, compile_torch_model + + +class FHEMode(enum.Enum): + disable = "disable" # Use torch weights + remote = "remote" # Use remote FHE server + simulate = "simulate" # Use FHE simulation + calibrate = "calibrate" # Use calibration (to run before FHE compilation) + + +def tuple_to_underscore_str(tup: Tuple) -> str: + """Converts a tuple to a string representation. + + Args: + tup (Tuple): a tuple to change into string representation + + Returns: + string: a string representing the tuple + """ + return repr(tup).replace("(", "p_").replace(")", "_p").replace(", ", "_") # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3858 @@ -58,30 +84,98 @@ class RemoteModule(nn.Module): def __init__( self, - module=None, - server_remote_address=None, + module: Optional[nn.Module] = None, + server_remote_address: Optional[str] = None, + module_name: Optional[str] = None, + model_name: Optional[str] = None, + verbose: int = 0, ): super().__init__() - self.private_module = module - self.server_remote_address = server_remote_address - self.calibration_data = [] + self.private_module: Optional[nn.Module] = module + self.server_remote_address: Optional[str] = server_remote_address + self.calibration_data: List = [] self.uid = str(uuid.uuid4()) - self.private_q_module = None - self.fhe_local_mode = "disable" - self.client: Optional[FHEModelClient] = None - self.path_to_keys = None - self.path_to_client = None - - def init_fhe_client(self, path_to_client: str, path_to_keys: str): + self.private_q_module: Optional[QuantizedModule] = None + # TODO: figure out if this is good + self.fhe_local_mode: FHEMode = FHEMode.calibrate + self.clients: Dict[str, Tuple[str, FHEModelClient]] = {} + self.path_to_keys: Optional[Path] = None + self.path_to_clients: Optional[Path] = None + self.module_name: Optional[str] = module_name + self.model_name: Optional[str] = model_name + self.verbose = verbose + + def init_fhe_client( + self, path_to_client: Optional[Path] = None, path_to_keys: Optional[Path] = None + ): """Set the clients keys. Args: path_to_client (str): Path where the client.zip is located. path_to_keys (str): Path where keys are located. """ - # TODO: here we need to load fhe client.zip with FHEModelClient. - # Either by getting it from the server with the self.uid or - # directly getting it when downloading the model from HF. 
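+        # Client initialization flow implemented below:
+        #   1. Resolve (and create if needed) the local folders used to store the
+        #      downloaded client.zip files and the generated keys.
+        #   2. Ask the server which input shapes are available for this module
+        #      (GET /list_shapes).
+        #   3. For each shape: download the matching client.zip (GET /get_client),
+        #      generate the private and evaluation keys locally, and upload the
+        #      serialized evaluation keys (POST /add_key).
+        #   4. Store the returned key uid together with its FHEModelClient,
+        #      indexed by input shape, in self.clients.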
+ self.path_to_clients = path_to_client + if self.path_to_clients is None: + self.path_to_clients = Path() / "clients" + self.path_to_clients.mkdir(exist_ok=True) + self.path_to_keys = path_to_keys + if self.path_to_keys is None: + self.path_to_keys = Path() / "keys" + self.path_to_keys.mkdir(exist_ok=True) + + assert self.module_name is not None + shapes_response = requests.get( + f"{self.server_remote_address}/list_shapes", + data={"module_name": self.module_name, "model_name": self.model_name}, + ) + if shapes_response.status_code != 200: + # Add link to request content + raise ValueError( + f"Couldn't get shapes from server:\n{shapes_response.content.decode('utf-8')}" + ) + shapes = shapes_response.json() + for shape in shapes: + client_response = requests.get( + f"{self.server_remote_address}/get_client", + data={ + "module_name": self.module_name, + "model_name": self.model_name, + "input_shape": shape, + }, + ) + if client_response.status_code != 200: + # Add link to request content + raise ValueError( + f"Couldn't get client from server:\n{client_response.content.decode('utf-8')}" + ) + path_to_client = self.path_to_clients / tuple_to_underscore_str(ast.literal_eval(shape)) + path_to_client.mkdir(exist_ok=True) + with open(path_to_client / "client.zip", "wb") as file: + file.write(client_response.content) + # Create the client + client = FHEModelClient( + path_dir=str(path_to_client.resolve()), key_dir=str(self.path_to_keys.resolve()) + ) + # The client first need to create the private and evaluation keys. + client.generate_private_and_evaluation_keys() + # Get the serialized evaluation keys + serialized_evaluation_keys = client.get_serialized_evaluation_keys() + if self.verbose: + print(f"Evaluation keys size: {len(serialized_evaluation_keys) / (10**6):.2f} MB") + assert isinstance(serialized_evaluation_keys, bytes) + assert self.module_name is not None + response = requests.post( + f"{self.server_remote_address}/add_key", + data={ + "module_name": self.module_name, + "model_name": self.model_name, + "input_shape": shape, + }, + files={"key": io.BytesIO(initial_bytes=serialized_evaluation_keys)}, + ) + assert response.status_code == 200, response.content.decode("utf-8") + uid = response.json()["uid"] + self.clients[shape] = (uid, client) def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward pass of the remote module. @@ -91,20 +185,40 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: Returns: (torch.Tensor): The output tensor. 
+ + Raises: + ValueError: if fhe_mode is not supported """ - if self.fhe_local_mode != "disable": - # for mypy - assert self.private_module is not None + # - disable: torch module + # - remote: client-server + # - simulate: compiled simulation + # - calibrate: calibration + + if self.fhe_local_mode not in {FHEMode.disable, FHEMode.calibrate, FHEMode.remote, None}: + # Using quantized module assert self.private_q_module is not None - y = self.private_q_module.forward(x.detach().numpy(), fhe=self.fhe_local_mode) + y = self.private_q_module.forward(x.detach().numpy(), fhe=self.fhe_local_mode.value) + y = torch.Tensor(y) + elif self.fhe_local_mode == FHEMode.disable: + # Calling torch + assert self.private_module is not None + y = self.private_module.forward( + x.detach(), + ) y = torch.Tensor(y) - elif self.private_module is not None: + elif self.fhe_local_mode == FHEMode.calibrate: + # Calling torch + gathering calibration data + assert self.private_module is not None if isinstance(x, torch.Tensor): self.calibration_data.append(x.detach()) y = self.private_module(x) # TODO: https://github.com/zama-ai/concrete-ml-internal/issues/3869 - # else: - # y = self.remote_call(x) + elif self.fhe_local_mode == FHEMode.remote: + # Remote call + y = self.remote_call(x) + else: + # Shouldn't happen + raise ValueError(f"{self.fhe_local_mode} is not recognized") return y def remote_call(self, x: torch.Tensor): @@ -115,6 +229,49 @@ def remote_call(self, x: torch.Tensor): """ # TODO: https://github.com/zama-ai/concrete-ml-internal/issues/3869 # implement server call and client initialization + base_device = x.device + x = x.to(device="cpu") + inferences = [] + for index in range(len(x)): + clear_input = x[[index], :].detach().numpy() + input_shape = tuple(clear_input.shape) + repr_input_shape = str(input_shape[1:]) + assert isinstance(clear_input, numpy.ndarray) + assert repr_input_shape in self.clients + key_id, client = self.clients[repr_input_shape] + assert client is not None + encrypted_input = client.quantize_encrypt_serialize(clear_input) + assert isinstance(encrypted_input, bytes) + if self.verbose: + print( + f"Encrypted input size: {sys.getsizeof(encrypted_input) / 1024 / 1024:.2f} MB" + ) + start = time.time() + assert self.module_name is not None + if self.verbose: + print("Infering ...") + inference_query = requests.post( + f"{self.server_remote_address}/compute", + files={ + "model_input": io.BytesIO(encrypted_input), + }, + data={ + "uid": key_id, + "module_name": self.module_name, + "model_name": self.model_name, + "input_shape": repr_input_shape, + }, + stream=True, + ) + end = time.time() + if self.verbose: + print(f"Inference done in {end - start} seconds") + # Unpack the results + assert inference_query.status_code == 200, inference_query.content.decode("utf-8") + encrypted_result = inference_query.content + decrypted_prediction = client.deserialize_decrypt_dequantize(encrypted_result)[0] + inferences.append(decrypted_prediction) + return torch.Tensor(numpy.array(inferences)).to(device=base_device) class HybridFHEModel: @@ -125,33 +282,45 @@ def __init__( model: nn.Module, module_names: Union[str, List[str]], server_remote_address=None, + model_name: str = "model", + verbose: int = 0, ): self.model = model self.module_names = [module_names] if isinstance(module_names, str) else module_names self.server_remote_address = server_remote_address - self.private_modules = { + self.private_modules: Dict[str, nn.Module] = { name: self._get_module_by_name(self.model, name) for name in 
self.module_names } - self.remote_modules: dict = {} + self.remote_modules: Dict[str, RemoteModule] = {} self.private_q_modules: dict = {} - self.configuration: Configuration = None + self.configuration: Optional[Configuration] = None + self.model_name = model_name + self.verbose = verbose self._replace_modules() def _replace_modules(self): """Replace the private modules in the model with remote layers.""" - for name in self.module_names: + for module_name in self.module_names: # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3858 # Conv1d introduce reshaping operations which adds more TLU - self.private_modules[name] = convert_conv1d_to_linear(self.private_modules[name]) + self.private_modules[module_name] = convert_conv1d_to_linear( + self.private_modules[module_name] + ) - remote_module = RemoteModule(self.private_modules[name], self.server_remote_address) + remote_module = RemoteModule( + module=self.private_modules[module_name], + server_remote_address=self.server_remote_address, + module_name=module_name, + model_name=self.model_name, + verbose=self.verbose, + ) - self.remote_modules[name] = remote_module + self.remote_modules[module_name] = remote_module # Now we need to replace the module in its parent module. - *path, last = name.split(".") + *path, last = module_name.split(".") parent_module = ( self._get_module_by_name(self.model, ".".join(path)) if path else self.model ) @@ -167,10 +336,9 @@ def __call__(self, x: torch.Tensor, fhe: str = "disable") -> torch.Tensor: Returns: (torch.Tensor): The output tensor. """ - # Set the fhe mode in each remote module for module in self.remote_modules.values(): - module.fhe_local_mode = fhe + module.fhe_local_mode = FHEMode(fhe) x = self.model(x) return x @@ -188,28 +356,36 @@ def _get_module_by_name(model: nn.Module, name: str) -> Union[RemoteModule, nn.M Raises: ValueError: If no module found for the given name. """ + # TODO: Shouldn't this search recursively in name modules of name modules? for module_name, module in model.named_modules(): if module_name == name: return module raise ValueError(f"No module found for name {name}") - def init_client(self, path_to_client: str, path_to_keys: str): + def init_client( + self, path_to_clients: Optional[Path] = None, path_to_keys: Optional[Path] = None + ): """Initialize client for all remote modules. Args: path_to_client (str): Path to the client.zip files. path_to_keys (str): Path to the keys folder. """ - # TODO: https://github.com/zama-ai/concrete-ml-internal/issues/3869 - # implement client initialization + if path_to_clients is None: + path_to_clients = Path("clients") + path_to_clients.mkdir(exist_ok=True) + for module_name, module in self.remote_modules.items(): + path_to_client = path_to_clients / module_name + path_to_client.mkdir(exist_ok=True) + module.init_fhe_client(path_to_client=path_to_client, path_to_keys=path_to_keys) def compile_model( self, x: torch.Tensor, n_bits: int = 8, - rounding_threshold_bits: int = 8, - p_error=0.01, - configuration: Configuration = None, + rounding_threshold_bits: Optional[int] = 8, + p_error: float = 0.01, + configuration: Optional[Configuration] = None, ): """Compiles the specific layers to FHE. @@ -224,7 +400,12 @@ def compile_model( configuration (Configuration): A concrete Configuration object specifying the FHE encryption parameters. If not specified, a default configuration is used. 
""" + # We do a forward pass where we accumulate inputs to use for compilation + for name in self.module_names: + # default is "calibrate" + self.remote_modules[name].fhe_local_mode = FHEMode.calibrate self.model(x) + self.configuration = configuration for name in self.module_names: @@ -244,22 +425,32 @@ def compile_model( self.remote_modules[name].private_q_module = self.private_q_modules[name] - def _save_fhe_circuit(self, path: Path): + def _save_fhe_circuit(self, path: Path, via_mlir=False): """Private method that saves the FHE circuits. Args: path (Path): The directory where the FHE circuit will be saved. """ - path = Path(path) - for name in self.module_names: + model_path = Path(path) + for module_name in self.module_names: + input_shapes = [ + tuple([elt.dim_value for elt in onnx_input.type.tensor_type.shape.dim]) + for onnx_input in self.private_q_modules[ + self.module_names[0] + ]._onnx_model.graph.input + ] + assert len(input_shapes) == 1, "Multi-input circuits not supported yet" + model_module_path = model_path.resolve() / module_name + model_module_path.mkdir(exist_ok=True) + model_module_shape_path = model_module_path / tuple_to_underscore_str(input_shapes[0]) model_dev = FHEModelDev( - str(path.resolve()) + f"/{name}_fhe_circuit", - self.private_q_modules[name], + str(model_module_shape_path.resolve()), + self.private_q_modules[module_name], ) - model_dev.save() + model_dev.save(via_mlir=via_mlir) - def save_and_clear_private_info(self, path: Path): + def save_and_clear_private_info(self, path: Path, via_mlir=False): """Save the PyTorch model to the provided path and also saves the corresponding FHE circuit. Args: @@ -269,7 +460,6 @@ def save_and_clear_private_info(self, path: Path): path.mkdir(parents=True, exist_ok=True) for name in self.module_names: module = self._get_module_by_name(self.model, name) - # Remove private information for attr in ["private_module", "calibration_data", "private_q_module"]: if hasattr(module, attr): @@ -280,7 +470,7 @@ def save_and_clear_private_info(self, path: Path): torch.save(self.model, model_path.resolve()) # Save the FHE circuit in the same directory - self._save_fhe_circuit(path) + self._save_fhe_circuit(path, via_mlir=via_mlir) def publish_to_hub(self): """Allow the user to push the model and FHE required files to HF Hub.""" diff --git a/tests/torch/test_hybrid_converter.py b/tests/torch/test_hybrid_converter.py index 1d7557f1b4..8b288dbee1 100644 --- a/tests/torch/test_hybrid_converter.py +++ b/tests/torch/test_hybrid_converter.py @@ -72,9 +72,10 @@ def run_hybrid_model_test( module_names = module_names if isinstance(module_names, list) else [module_names] + # TODO: fix it -> broken due to shape handling # List of files to check files = ["model.pth"] + [ - f"{module_name}_fhe_circuit/{file_name}" + f"{module_name}/{file_name}" for module_name in module_names for file_name in ["client.zip", "server.zip", "versions.json"] ] diff --git a/use_case_examples/hybrid_model/README.md b/use_case_examples/hybrid_model/README.md new file mode 100644 index 0000000000..f4017ea305 --- /dev/null +++ b/use_case_examples/hybrid_model/README.md @@ -0,0 +1,14 @@ +# Hybrid model + +This use case example showcases how to partially run layers in FHE. + +In this case we apply a fully connected layer of a GPT-2 model in FHE. + +## How to run this use-case + +0. Install additional requirements using `python -m pip install -r requirements.txt` +1. Compile GPT-2 model using `bash compile.sh` script +1. Run FHE server using `bash serve.sh` +1. 
Run FHE client using `python infer_hybrid_llm_generate.py` + - You will first be asked about the number of tokens that you want to generate + - Then you will be able to enter your prompt diff --git a/use_case_examples/hybrid_model/compile.sh b/use_case_examples/hybrid_model/compile.sh new file mode 100644 index 0000000000..acbf2f7777 --- /dev/null +++ b/use_case_examples/hybrid_model/compile.sh @@ -0,0 +1,6 @@ +#!/bin/bash +VIA_MLIR=0 +for INDEX in 2 3 +do + INDEX=$INDEX VIA_MLIR=$VIA_MLIR python compile_hybrid_llm.py +done diff --git a/use_case_examples/hybrid_model/compile_hybrid_llm.py b/use_case_examples/hybrid_model/compile_hybrid_llm.py new file mode 100644 index 0000000000..f88cf6de21 --- /dev/null +++ b/use_case_examples/hybrid_model/compile_hybrid_llm.py @@ -0,0 +1,140 @@ +"""Showcase for the hybrid model converter.""" + +import os +from copy import deepcopy +from pathlib import Path +from typing import List, Union + +import torch +from concrete.fhe import Configuration, ParameterSelectionStrategy +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer + +from concrete.ml.torch.hybrid_model import HybridFHEModel + + +def compile_model( + model_name: str, + model: torch.nn.Module, + inputs: torch.Tensor, + module_names: Union[str, List], + expected_accuracy, + models_dir: Path, +): + """Run the test for any model with its private module names.""" + + # Enable multi params/precision + configuration = Configuration( + single_precision=False, + parameter_selection_strategy=ParameterSelectionStrategy.MULTI, + ) + + # Create a hybrid model + hybrid_model = HybridFHEModel(model, module_names) + hybrid_model.compile_model( + inputs, + n_bits=8, + # setting it to None is not enough -> weird + rounding_threshold_bits=None, + configuration=configuration, + ) + + # Sanity checks + logits_simulate = hybrid_model(inputs, fhe="simulate").logits + logits_disable = hybrid_model(inputs, fhe="disable").logits + logits_original = model(inputs).logits + # Ensure logits_disable and logits_original return the same output for the logits + assert torch.allclose(logits_disable, logits_original, atol=1e-7), "Outputs do not match!" + # Compare the topk accuracy of the FHE simulate circuit vs. the original. 
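+    # Top-k agreement: for each position, the fraction of the k indices predicted
+    # under FHE simulation that also appear among the k indices predicted by the
+    # clear model; the final score is the mean over all positions and indices.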
+ k = 100 + # Get the topk indices for logits_disable and logits_simulate + topk_disable = logits_disable.topk(k, dim=-1).indices + topk_simulate = logits_simulate.topk(k, dim=-1).indices + # Prepare tensors for broadcasting + expanded_simulate = topk_simulate.unsqueeze(-1) + expanded_disable = topk_disable.unsqueeze(-2) + # Compute if elements of topk_simulate are in topk_disable for each token + is_in = (expanded_simulate == expanded_disable).any(-1) + # Compute average of these counts (the accuracy) + accuracy = is_in.float().mean() + # Make sure accuracy is above a certain threshold + if accuracy >= expected_accuracy: + print("Expected accuracy GPT2 hybrid not matched.") + + # Compilation + models_dir.mkdir(exist_ok=True) + model_dir = models_dir / model_name + print(f"Saving to {model_dir}") + via_mlir = bool(int(os.environ.get("VIA_MLIR", 0))) + hybrid_model.save_and_clear_private_info(model_dir, via_mlir=via_mlir) + + +if __name__ == "__main__": + configs = [ + ("transformer.h.0.mlp", 0.934), # Full MLP + (["transformer.h.0.mlp", "transformer.h.1.mlp"], 0.42), # Two full MLPs + ("transformer.h.0.mlp.c_proj", 0.986), # only projection in MLP + ("transformer.h.0.attn.c_proj", 0.986), # only projection in MLP + ] + config_index = int(os.environ.get("INDEX", 2)) + config = configs[config_index][0] + expected_accuracy = configs[config_index][1] + + # Compilation should be done on CPU + device = "cpu" + print(f"Using device: {device}") + + # Get GPT2 from Huggingface + model_name = "gpt2" + model_name_no_special_char = model_name.replace("/", "_") + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map=device, + trust_remote_code=True, + ) + + configuration = { + "model_name": model_name, + "model_name_no_special_char": model_name_no_special_char, + "configuration": config, + } + + # In this case we compile for only one sample + # We might want to compile for multiple samples + # To do this the easiest solution is to compile on contexts of different sizes. 
+ # They should all have the same lengths + # We might hack something based on HuggingFace dataset with some truncation + # Without truncation or selection it would require some knowledge of the tokenizer + max_context_size = 20 + num_samples = 50 + + dataset = load_dataset("wikipedia", "20220301.en") + print(model) + models_dir = Path(__file__).parent / os.environ.get("MODELS_DIR_NAME", "compiled_models") + models_dir.mkdir(exist_ok=True) + + # Compile for different shapes + for context_size in range(1, max_context_size): + prompts = [] + counter = 0 + for sample in dataset["train"]: + encoded = tokenizer.encode(sample["text"], return_tensors="pt") + if encoded.shape[1] >= context_size: + counter += 1 + prompts.append(encoded[:, :context_size]) + if counter == num_samples: + break + compile_inputset = torch.cat(prompts).to(device) + print(context_size, "compilation") + assert isinstance(model, torch.nn.Module) + + # We modify the model in place, so to compile multiple times we need to deepcopy the model + compile_model( + f"{model_name}_{config_index}", + deepcopy(model), + compile_inputset, + config, + expected_accuracy, + models_dir=models_dir, + ) diff --git a/use_case_examples/hybrid_model/infer_hybrid_llm_generate.py b/use_case_examples/hybrid_model/infer_hybrid_llm_generate.py new file mode 100644 index 0000000000..7e19f694f0 --- /dev/null +++ b/use_case_examples/hybrid_model/infer_hybrid_llm_generate.py @@ -0,0 +1,89 @@ +"""Showcase for the hybrid model converter.""" +import time +from pathlib import Path + +import torch +from torch.backends import mps +from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer + +from concrete.ml.torch.hybrid_model import FHEMode, HybridFHEModel + +if __name__ == "__main__": + configs = [ + ("transformer.h.0.mlp", 0.934), + (["transformer.h.0.mlp", "transformer.h.1.mlp"], 0.42), + ("transformer.h.0.mlp.c_proj", 0.986), + ("transformer.h.0.attn.c_proj", 0.986), + ] + config_index = 3 + config = configs[config_index][0] + + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + if mps.is_available(): + device = "mps" + print(f"Using device: {device}") + + # Get GPT2 from Huggingface + # TODO: migrate to auto-model with model_name + model_name = "gpt2" + model_name_no_special_char = model_name.replace("/", "_") + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map=device, + trust_remote_code=True, + ) + + # Modify model to use remote FHE server instead of local weights + hybrid_model = HybridFHEModel( + model, + config, + server_remote_address="http://0.0.0.0:8000", + model_name=f"{model_name}_{config_index}", + verbose=False, + ) + path_to_clients = Path(__file__).parent / "clients" + hybrid_model.init_client(path_to_clients=path_to_clients) + for module in hybrid_model.remote_modules.values(): + module.fhe_local_mode = FHEMode.remote + + # Run example + while True: + # Take inputs + num_tokens = input("Number of tokens:\n") + if not num_tokens: + num_tokens = 5 + else: + num_tokens = int(num_tokens) + prompt = input("Prompt:\n") + if not prompt: + prompt = "Computations on encrypted data can help" + + # Encode and send to device + input_ids = tokenizer.encode(prompt, return_tensors="pt") + assert isinstance(input_ids, torch.Tensor) + input_ids = input_ids.to(device=device) + + print("*" * 10) + print("*" * 10) + print(f"{input_ids.shape[1]} tokens in '{prompt}'") + print("*" * 10) + print("*" * 10) + + # Print words as they are generated + 
streamer = TextStreamer(tokenizer=tokenizer) + start = time.time() + output_ids = model.generate( + input_ids, max_new_tokens=num_tokens, use_cache=True, streamer=streamer + ) + end = time.time() + generated = tokenizer.decode(output_ids[0]) + + print(f"{end - start} seconds to generate") + print("*" * 10) + print("*" * 10) + print(generated) + print("*" * 10) + print("*" * 10) diff --git a/use_case_examples/hybrid_model/load_and_analyze_data.py b/use_case_examples/hybrid_model/load_and_analyze_data.py new file mode 100644 index 0000000000..7fe8b1851e --- /dev/null +++ b/use_case_examples/hybrid_model/load_and_analyze_data.py @@ -0,0 +1,32 @@ +import json +from collections import Counter + +import matplotlib.pyplot as plt +from datasets import load_dataset +from tqdm import tqdm + + +def main(): + """ + Load wikipedia dataset and plot lenghts of text histogram. + For now this considers only the number of characters but we could also consider some stats like + the number of tokens, unique tokens, etc ... + """ + dataset = load_dataset("wikipedia", "20220301.en") + lengths = [len(sample["text"]) for sample in tqdm(dataset["train"])] + count = Counter(lengths) + print(count) + with open("wikipedia_counts.json", "w") as file: + json.dump(count, file) + with open("wikipedia_values.json", "w") as file: + json.dump(lengths, file) + + # Matplotlib plot + plt.subplots() + plt.hist(lengths, bins=1000) + plt.yscale("log") + plt.savefig("lengths.png") + + +if __name__ == "__main__": + main() diff --git a/use_case_examples/hybrid_model/requirements.txt b/use_case_examples/hybrid_model/requirements.txt new file mode 100644 index 0000000000..4ec68127ac --- /dev/null +++ b/use_case_examples/hybrid_model/requirements.txt @@ -0,0 +1,3 @@ +datasets==2.14.4 +apache_beam==2.49.0 +mwparserfromhell==0.6.4 diff --git a/use_case_examples/hybrid_model/serve.sh b/use_case_examples/hybrid_model/serve.sh new file mode 100644 index 0000000000..9d7b8f4a48 --- /dev/null +++ b/use_case_examples/hybrid_model/serve.sh @@ -0,0 +1,13 @@ +#!/bin/bash +uname_str=$(uname) +echo "${uname_str}" +if [[ $uname_str != "Darwin" ]]; then + echo "Not Darwin" + # tune the cpu-list according to the resources you want to allocate to it + PATH_TO_MODELS="compiled_models" PORT=8000 taskset --cpu-list 0-12 python serve_model.py + # No-limit + # PATH_TO_MODELS="compiled_models" PORT=8000 python serve_model.py +else + echo "Darwin" + PATH_TO_MODELS="compiled_models" PORT=8000 python serve_model.py +fi diff --git a/use_case_examples/hybrid_model/serve_model.py b/use_case_examples/hybrid_model/serve_model.py new file mode 100644 index 0000000000..9609465205 --- /dev/null +++ b/use_case_examples/hybrid_model/serve_model.py @@ -0,0 +1,193 @@ +"""Hybrid Model Deployment Server. 
+ +Routes: + - Get all names + - Get client.zip + - Add a key + - Compute +""" + +import ast +import io +import os +import time +import uuid +from collections import defaultdict +from functools import lru_cache +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import uvicorn +from fastapi import FastAPI, Form, HTTPException, UploadFile +from fastapi.responses import FileResponse, StreamingResponse +from loguru import logger + +# No relative import here because when not used in the package itself +from concrete.ml.deployment import FHEModelServer + + +def underscore_str_to_tuple(tup): + return ast.literal_eval(tup.replace("p_", "(").replace("_p", ")").replace("_", ", ")) + + +if __name__ == "__main__": + app = FastAPI(debug=False) + # Model-name -> Module-Name -> Input-shape + + FILE_FOLDER = Path(__file__).parent + KEY_PATH = Path(os.environ.get("KEY_PATH", FILE_FOLDER / Path("user_keys"))) + KEY_PATH.mkdir(exist_ok=True) + MODELS_PATH = Path(os.environ.get("PATH_TO_MODELS", FILE_FOLDER / Path("model"))) + PORT = os.environ.get("PORT", "5000") + MODULES = defaultdict(dict) + # Populate modules -> could be done dynamically on each query tbh + for model_path in MODELS_PATH.iterdir(): # Model + # TODO: change with a struct/obj + model_name = model_path.name + MODULES[model_name] = defaultdict(dict) + for module_path in model_path.iterdir(): # Module + if not module_path.is_dir(): + continue + module_name = module_path.name + MODULES[model_name][module_name] = defaultdict(dict) + for input_shape_path in module_path.iterdir(): + if not input_shape_path.is_dir(): + continue + input_shape = str(underscore_str_to_tuple(input_shape_path.name)) + MODULES[model_name][module_name][input_shape] = { + "path": input_shape_path.resolve(), + "module_name": module_name, + "model_name": model_name, + "shape": input_shape, + } + + @lru_cache(maxsize=None) + def load_key(uid) -> bytes: + with open(KEY_PATH / str(uid), "rb") as file: + return file.read() + + def dump_key(key_bytes: bytes, uid: Union[uuid.UUID, str]) -> None: + with open(KEY_PATH / str(uid), "wb") as file: + file.write(key_bytes) + + @lru_cache(maxsize=None) + def get_circuit(model_name, module_name, input_shape): + return FHEModelServer(str(MODULES[model_name][module_name][input_shape]["path"])) + + def check_inputs(model_name: str, module_name: Optional[str], input_shape: Optional[Tuple]): + if model_name not in MODULES: + raise HTTPException( + status_code=500, + detail=f"provided names '{model_name}' does not match any known name", + ) + if module_name is not None and module_name not in MODULES[model_name]: + raise HTTPException( + status_code=500, + detail=f"provided names '{module_name}' does not match any known name" + f"{list(MODULES[model_name].keys())}", + ) + if input_shape is not None and input_shape not in MODULES[model_name][module_name]: + raise HTTPException( + status_code=500, + detail=f"provided names '{module_name}' does not match any known name" + f"{list(MODULES[model_name][module_name].keys())}", + ) + + @app.get("/list_models") + def list_models(): + return MODULES + + @app.get("/list_modules") + def list_modules(model_name: str = Form()): + check_inputs(model_name, None, None) + return MODULES[model_name] + + @app.get("/list_shapes") + def list_shapes(model_name: str = Form(), module_name: str = Form()): + check_inputs(model_name, module_name, None) + return MODULES[model_name][module_name] + + @app.get("/get_client") + def get_client(model_name: str = Form(), module_name: str = Form(), 
input_shape: str = Form()): + """Get client. + + Returns: + FileResponse: client.zip + + Raises: + HTTPException: if the file can't be find locally + """ + check_inputs(model_name, module_name, input_shape) + path_to_client = ( + MODULES[model_name][module_name][str(input_shape)]["path"] / "client.zip" + ).resolve() + if not path_to_client.exists(): + raise HTTPException(status_code=500, detail="Could not find client.") + return FileResponse(path_to_client, media_type="application/zip") + + @app.post("/add_key") + async def add_key( + key: UploadFile, + model_name: str = Form(), + module_name: str = Form(), + input_shape: str = Form(), + ): + """Add public key. + + Arguments: + key (UploadFile): public key + + Returns: + Dict[str, str] + - uid: uid a personal uid + """ + check_inputs(model_name, module_name, input_shape) + uid = str(uuid.uuid4()) + key_bytes = await key.read() + dump_key(key_bytes, uid) + # TODO: we should probably store for which circuit the key was generated for + # such that we can raise an error if the targeted keys does not match the correct circuit + return {"uid": uid} + + @app.post("/compute") + async def compute( + model_input: UploadFile, + uid: str = Form(), + model_name: str = Form(), + module_name: str = Form(), + input_shape: str = Form(), + ): # noqa: B008 + """Compute the circuit over encrypted input. + + Arguments: + model_input (UploadFile): input of the circuit + uid (str): uid of the public key to use + + Returns: + StreamingResponse: the result of the circuit + """ + check_inputs(model_name, module_name, input_shape) + start = time.time() + key_bytes = load_key(uid) + end = time.time() + logger.info(f"It took {end - start} seconds to load the key") + + start = time.time() + fhe = get_circuit(model_name, module_name, input_shape) + end = time.time() + logger.info(f"It took {end - start} seconds to load the circuit") + + start = time.time() + encrypted_results = fhe.run( + serialized_encrypted_quantized_data=await model_input.read(), + serialized_evaluation_keys=key_bytes, + ) + end = time.time() + logger.info(f"fhe inference of input of shape {input_shape} took {end - start}") + logger.info(f"Results size is {len(encrypted_results)/(1024**2)} Mb") + start = time.time() + return StreamingResponse( + io.BytesIO(encrypted_results), + ) + + uvicorn.run(app, host="0.0.0.0", port=int(PORT)) diff --git a/use_case_examples/llm/QGPT2Evaluate.ipynb b/use_case_examples/llm/QGPT2Evaluate.ipynb index a67580b04f..5c560095ec 100644 --- a/use_case_examples/llm/QGPT2Evaluate.ipynb +++ b/use_case_examples/llm/QGPT2Evaluate.ipynb @@ -695,5 +695,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }
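For reference, the routes added in serve_model.py can be exercised directly with `requests`, outside of `RemoteModule`. The sketch below mirrors the request style used by `RemoteModule.init_fhe_client` (form data on GET requests); the server address and the `gpt2_3` model name are assumptions matching the defaults of compile.sh, serve.sh and infer_hybrid_llm_generate.py. It lists the available modules and input shapes and downloads one client.zip; /add_key and /compute additionally require generated keys and an encrypted input, as shown in RemoteModule.

# Minimal smoke test for the serving routes (illustrative only).
import requests

SERVER = "http://0.0.0.0:8000"
MODEL_NAME = "gpt2_3"

# List the compiled models with their modules and input shapes known to the server
print(requests.get(f"{SERVER}/list_models").json())

# Pick one module of the model, then one of its input shapes
modules = requests.get(f"{SERVER}/list_modules", data={"model_name": MODEL_NAME}).json()
module_name = next(iter(modules))
shapes = requests.get(
    f"{SERVER}/list_shapes",
    data={"model_name": MODEL_NAME, "module_name": module_name},
).json()
shape = next(iter(shapes))

# Download the client.zip for that (model, module, shape), as RemoteModule does
response = requests.get(
    f"{SERVER}/get_client",
    data={"model_name": MODEL_NAME, "module_name": module_name, "input_shape": shape},
)
response.raise_for_status()
with open("client.zip", "wb") as file:
    file.write(response.content)
print(f"Downloaded client.zip for {module_name} with input shape {shape}")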