Skip to content

Commit

Permalink
feat: add encrypted data-frame API
Browse files Browse the repository at this point in the history
  • Loading branch information
RomanBredehoft committed Mar 11, 2024
1 parent 20afcc6 commit cfe56f3
Show file tree
Hide file tree
Showing 17 changed files with 2,144 additions and 764 deletions.
2 changes: 1 addition & 1 deletion ci/aws_ami_build_component.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ phases:
- apt install -y git-lfs
- git lfs install
- source venv/bin/activate
- python -m pip install pytest==7.1.1 pandas==1.3.0 tensorflow==2.12.0 tf2onnx==1.13.0 torchvision==0.14.1
- python -m pip install pytest==7.1.1 pandas==2.0.3 tensorflow==2.12.0 tf2onnx==1.13.0 torchvision==0.14.1

# We disable tests for test_deploy file because the instance does not have AWS CLI setup
- name: RunTests
Expand Down
34 changes: 18 additions & 16 deletions deps_licenses/licenses_mac_silicon_user.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Name, Version, License
GitPython, 3.1.41, BSD License
PyYAML, 6.0.1, MIT License
anyio, 3.7.1, MIT License
boto3, 1.34.38, Apache Software License
botocore, 1.34.38, Apache Software License
boto3, 1.34.59, Apache Software License
botocore, 1.34.59, Apache Software License
brevitas, 0.8.0, UNKNOWN
certifi, 2023.7.22, Mozilla Public License 2.0 (MPL 2.0)
charset-normalizer, 3.3.2, MIT License
Expand All @@ -15,15 +15,15 @@ dill, 0.3.8, BSD License
exceptiongroup, 1.2.0, MIT License
fastapi, 0.103.2, MIT License
filelock, 3.13.1, The Unlicense (Unlicense)
flatbuffers, 23.5.26, Apache Software License
flatbuffers, 24.3.7, Apache Software License
fsspec, 2024.2.0, BSD License
gitdb, 4.0.11, BSD License
h11, 0.14.0, MIT License
huggingface-hub, 0.20.3, Apache Software License
huggingface-hub, 0.21.4, Apache Software License
humanfriendly, 10.0, MIT License
hummingbird-ml, 0.4.8, MIT License
idna, 3.6, BSD License
importlib-resources, 6.1.1, Apache Software License
importlib_resources, 6.1.3, Apache Software License
iniconfig, 2.0.0, MIT License
jmespath, 1.0.1, MIT License
joblib, 1.3.2, BSD License
Expand All @@ -35,15 +35,17 @@ onnxconverter-common, 1.13.0, MIT License
onnxmltools, 1.11.0, Apache Software License
onnxoptimizer, 0.3.10, Apache License v2.0
onnxruntime, 1.13.1, MIT License
packaging, 23.2, Apache Software License; BSD License
packaging, 24.0, Apache Software License; BSD License
pandas, 2.0.3, BSD License
pluggy, 1.4.0, MIT License
protobuf, 3.20.3, BSD-3-Clause
psutil, 5.9.8, BSD License
pydantic, 1.10.14, MIT License
pytest, 7.4.1, MIT License
pytest-json-report, 1.5.0, MIT
pytest-metadata, 3.1.0, Mozilla Public License 2.0 (MPL 2.0)
python-dateutil, 2.8.2, Apache Software License; BSD License
pytest-metadata, 3.1.1, Mozilla Public License 2.0 (MPL 2.0)
python-dateutil, 2.9.0.post0, Apache Software License; BSD License
pytz, 2024.1, MIT License
regex, 2023.12.25, Apache Software License
requests, 2.31.0, Apache Software License
s3transfer, 0.10.0, Apache Software License
Expand All @@ -55,19 +57,19 @@ skl2onnx, 1.12, Apache Software License
skops, 0.5.0, MIT
skorch, 0.11.0, new BSD 3-Clause
smmap, 5.0.1, BSD License
sniffio, 1.3.0, Apache Software License; MIT License
sniffio, 1.3.1, Apache Software License; MIT License
starlette, 0.27.0, BSD License
sympy, 1.12, BSD License
tabulate, 0.8.10, MIT License
threadpoolctl, 3.2.0, BSD License
tokenizers, 0.15.1, Apache Software License
threadpoolctl, 3.3.0, BSD License
tokenizers, 0.15.2, Apache Software License
tomli, 2.0.1, MIT License
torch, 1.13.1, BSD License
tqdm, 4.66.1, MIT License; Mozilla Public License 2.0 (MPL 2.0)
transformers, 4.37.2, Apache Software License
tqdm, 4.66.2, MIT License; Mozilla Public License 2.0 (MPL 2.0)
transformers, 4.38.2, Apache Software License
typing_extensions, 4.5.0, Python Software Foundation License
urllib3, 1.26.18, MIT License
tzdata, 2024.1, Apache Software License
urllib3, 2.0.7, MIT License
uvicorn, 0.21.1, BSD License
xgboost, 1.6.2, Apache Software License
z3-solver, 4.12.5.0, MIT License
zipp, 3.17.0, MIT License
z3-solver, 4.13.0.0, MIT License
2 changes: 1 addition & 1 deletion deps_licenses/licenses_mac_silicon_user.txt.md5
Original file line number Diff line number Diff line change
@@ -1 +1 @@
a923947bfb17b658ab8efe61d5cafe96
5a9bba8e0d895b33d546b439e35100d2
1,519 changes: 776 additions & 743 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ onnx = "1.13.1"
scipy = "1.10.1"
numpy = "1.23.5"
protobuf = "3.20.3"
pandas = "^2.0.3"

# Deployment
boto3 = "^1.23.5"
Expand Down Expand Up @@ -90,8 +91,6 @@ nbqa = "^1.3.1"
darglint = "^1.8.1"
linkcheckmd = "^1.4.0"
keyring = "*"
# pandas is required for some of our notebooks but not by our source code
pandas = "^1.3.0"
jinja2 = "^3.1.2"
LinkChecker = "^10.1.0"
kaggle = "^1.5.12"
Expand Down
2 changes: 1 addition & 1 deletion script/make_utils/pytest_pypi_cml.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ source "${PYPI_VENV}/bin/activate"
# Investigate a better way of managing these dependencies
# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2685
python -m pip install --upgrade pip
python -m pip install pytest==7.4.1 pandas==1.5.3 tensorflow==2.12.0 tf2onnx==1.15.0 torchvision==0.14.1
python -m pip install pytest==7.4.1 pandas==2.0.3 tensorflow==2.12.0 tf2onnx==1.15.0 torchvision==0.14.1

# Install additional pytest plugins
python -m pip install pytest-xdist==3.3.1
Expand Down
158 changes: 158 additions & 0 deletions src/concrete/ml/dataframe/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
from typing import Dict, List

import numpy
import pandas

from .dataframe import EncryptedDataFrame
from .operator import EncryptedDataFrameOperator
from .utils import decrypt_elementwise, encrypt_elementwise


class EncryptedDataFrameClient:
    """Client object for pre-processing, encrypting and serializing data-frames to the server.

    A client is defined by a list of operators and their associated FHE clients.
    """

    def __init__(self, ops_kwargs: List[Dict], deployment_dir):
        """Build the operators and load their associated FHE clients.

        Args:
            ops_kwargs: Ordered keyword-argument mappings, one per operator. Each mapping is
                unpacked into ``EncryptedDataFrameOperator.init_and_check_metadata``.
            deployment_dir: Path given as ``metadata_dir_path`` when building the operators and
                to ``load_client`` when loading their FHE clients.
        """
        # Ordered encrypted pandas operators to consider
        self.ops = [
            EncryptedDataFrameOperator.init_and_check_metadata(
                metadata_dir_path=deployment_dir, **op_kwargs
            )
            for op_kwargs in ops_kwargs
        ]

        # Ordered FHE clients to consider (same order as the operators)
        self.clients = [op.load_client(deployment_dir) for op in self.ops]

    @staticmethod
    def _validate_data_frame(df: pandas.DataFrame, min_value: int, max_value: int):
        """Check that the data-frame only contains values between the given min/max.

        NaN values compare as False against both bounds and are therefore accepted here.

        Args:
            df: The data-frame to check.
            min_value: The smallest allowed value (inclusive).
            max_value: The largest allowed value (inclusive).

        Raises:
            ValueError: If at least one column contains a value below ``min_value`` or above
                ``max_value``. The message lists the offending column names.
        """
        columns_less_than_min = (df < min_value).any()
        column_names_less_than_min = columns_less_than_min[columns_less_than_min].index.tolist()

        if column_names_less_than_min:
            message = (
                f"Columns {column_names_less_than_min} contain values less than {min_value}, "
                "which is not allowed."
            )

            # The NaN explanation is only meaningful when 0 itself is among the rejected values.
            # NOTE(review): the original condition was lost to an operator-precedence mistake
            # (the whole message was compared to 0); confirm `min_value > 0` is the intent.
            if min_value > 0:
                message += (
                    " This is because 0 values are used to represent NaN values for FHE"
                    " computations."
                )

            raise ValueError(message)

        columns_greater_than_max = (df > max_value).any()
        column_names_greater_than_max = columns_greater_than_max[
            columns_greater_than_max
        ].index.tolist()

        if column_names_greater_than_max:
            raise ValueError(
                f"Columns {column_names_greater_than_max} contain values greater than {max_value}, "
                "which is not allowed."
            )

    def generate_keys(self, force: bool):
        """Generate the keys for all FHE clients.

        Args:
            force: Flag forwarded as-is to each client's ``keygen`` method.
        """
        for client in self.clients:
            client.keygen(force)

    def pre_process(self, input_df: pandas.DataFrame, op_position: int = 0) -> numpy.ndarray:
        """Pre-process the Pandas data-frame using the operator identified by the given position.

        The given data-frame is left untouched: NaN values are replaced on a copy.

        Args:
            input_df: The data-frame to pre-process.
            op_position: Index in ``self.ops`` of the operator whose supported value range is
                used for validation. Defaults to the first operator.

        Returns:
            The values of the data-frame as a Numpy array, with NaN values replaced by 0.

        Raises:
            ValueError: If the data-frame contains values outside the operator's supported range.
        """
        # Make sure the given data-frame only contains values within the range supported by the
        # operator. 0 is reserved for representing NaN values (see the replacement below), so
        # the supported range is expected to exclude it
        min_value, max_value = self.ops[op_position].get_supported_min_max()
        self._validate_data_frame(input_df, min_value, max_value)

        # Replace NaN values with 0, without modifying the caller's data-frame
        filled_df = input_df.fillna(0)

        return filled_df.to_numpy()

    # Use first op's client to encrypt values by default
    def encrypt(
        self, array_to_encrypt: numpy.ndarray, force_keygen: bool = False, op_position: int = 0
    ):
        """Encrypt the values using the operator's FHE client identified by the given position.

        Keys are (re-)generated for all clients before encrypting, through ``generate_keys``.

        Args:
            array_to_encrypt: The values to encrypt.
            force_keygen: Flag forwarded to ``generate_keys``.
            op_position: Index of the operator (and matching FHE client) to encrypt with.
                Defaults to the first operator.

        Returns:
            The output of ``encrypt_elementwise`` for the given array and client.
        """
        self.generate_keys(force=force_keygen)

        op = self.ops[op_position]
        client = self.clients[op_position]

        return encrypt_elementwise(
            array_to_encrypt,
            client,
            **op.get_encrypt_config(),
        )

    def generate_encrypted_nan_values(self):
        """Generate encrypted NaN representations for all FHE clients."""
        for op, client in zip(self.ops, self.clients):
            op.generate_encrypted_nan_value(client)

    def retrieve_evaluation_keys(self):
        """Retrieve and store evaluation keys for all FHE clients."""
        for op, client in zip(self.ops, self.clients):
            op.retrieve_evaluation_keys(client)

    def pre_process_encrypt_serialize(self, input_df: pandas.DataFrame, force_keygen: bool = False):
        """Process the Pandas data-frame, then encrypt and serialize the values.

        We currently assume that input values are only made for the first operator, meaning they
        are encrypted using this operator's FHE client.

        Args:
            input_df: The data-frame to send to the server.
            force_keygen: Flag forwarded to ``encrypt``.

        Returns:
            The dictionary built by ``EncryptedDataFrame.to_dict`` from the encrypted values and
            the data-frame's column names.
        """
        # TODO: for now, we assume that the only inputs to encrypt are the ones from the first op
        first_op_position = 0

        array_to_encrypt = self.pre_process(input_df, op_position=first_op_position)
        encrypted_values = self.encrypt(
            array_to_encrypt, force_keygen=force_keygen, op_position=first_op_position
        )

        encrypted_df_input = EncryptedDataFrame(encrypted_values, input_df.columns)

        return encrypted_df_input.to_dict()

    def get_serialized_ops(self):
        """Update and serialize the operators.

        Encrypted NaN values are generated and evaluation keys are retrieved for every operator
        before serializing.

        Returns:
            A list with the ``to_dict`` output of each operator, in order.
        """
        self.generate_encrypted_nan_values()
        self.retrieve_evaluation_keys()

        return [op.to_dict() for op in self.ops]

    # Use last op's client to decrypt values
    def decrypt(self, encrypted_values: numpy.ndarray, op_position: int = -1):
        """Decrypt the values using the operator's FHE client identified by the given position.

        Args:
            encrypted_values: The values to decrypt.
            op_position: Index of the FHE client to decrypt with. Defaults to the last one.

        Returns:
            The output of ``decrypt_elementwise`` for the given values and client.
        """
        return decrypt_elementwise(encrypted_values, self.clients[op_position])

    @staticmethod
    def post_process(
        output_array: numpy.ndarray, output_column_names: List[str]
    ) -> pandas.DataFrame:
        """Post-process the server's outputs and build a Pandas data-frame from them.

        Args:
            output_array: The decrypted values.
            output_column_names: The column names of the data-frame to build.

        Returns:
            A data-frame holding the given values, where 0 values are replaced by NaN.
        """
        # Replace 0 values by NaN, as 0 is the value used for representing NaN
        output_array_0_to_nan = numpy.where(output_array == 0, numpy.nan, output_array)

        # Convert the array to a pandas data-frame
        return pandas.DataFrame(
            output_array_0_to_nan,
            columns=output_column_names,
        )

    def deserialize_decrypt_post_process(self, server_output: Dict):
        """Process the server's outputs and provide them as a Pandas data-frame.

        Args:
            server_output: Dictionary loaded through ``EncryptedDataFrame.from_dict``.

        Returns:
            The decrypted and post-processed data-frame.
        """
        encrypted_df_output = EncryptedDataFrame.from_dict(server_output)

        output_array = self.decrypt(encrypted_df_output.encrypted_values)
        return self.post_process(output_array, encrypted_df_output.column_names)
34 changes: 34 additions & 0 deletions src/concrete/ml/dataframe/dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from typing import Dict, List

import numpy

from .utils import deserialize_elementwise, serialize_elementwise


class EncryptedDataFrame:
    """Define an encrypted data-frame that can be serialized."""

    def __init__(self, encrypted_values: numpy.ndarray, column_names: List[str]):
        """Store the encrypted values along with their column names.

        Args:
            encrypted_values: The encrypted values of the data-frame.
            column_names: The column names, in order. Any iterable is accepted (for example a
                pandas index) and is converted to a list.
        """
        self.encrypted_values = encrypted_values
        self.column_names = list(column_names)

        # Build the mapping from the stored list rather than from the raw argument, which may
        # be a one-shot iterable that the conversion above already consumed
        self.column_names_to_index = {name: index for index, name in enumerate(self.column_names)}

    def to_dict(self):
        """Serialize the instance to a dictionary.

        Returns:
            A dictionary with the serialized values under "encrypted_values" (as nested lists)
            and the column names under "column_names".
        """
        encrypted_values = serialize_elementwise(self.encrypted_values)

        # A Numpy array is not serializable using JSON so we need to convert to a list
        output_dict = {
            "encrypted_values": encrypted_values.tolist(),
            "column_names": self.column_names,
        }

        return output_dict

    @classmethod
    def from_dict(cls, dict_to_load: Dict):
        """Load an instance from a dictionary.

        Args:
            dict_to_load: Dictionary with the "encrypted_values" and "column_names" keys, as
                built by ``to_dict``.

        Returns:
            A new instance built from the deserialized values and the column names.
        """
        encrypted_values = deserialize_elementwise(dict_to_load["encrypted_values"])
        column_names = dict_to_load["column_names"]

        return cls(encrypted_values, column_names)
Loading

0 comments on commit cfe56f3

Please sign in to comment.