From 4473764b214c6558bd0cab079fdd4bc352de8c7b Mon Sep 17 00:00:00 2001
From: Roman Bredehoft
Date: Mon, 27 May 2024 16:27:52 +0200
Subject: [PATCH] chore: add checks for value error raises

---
 tests/pandas/test_pandas.py | 145 +++++++++++++++++++++++++++++++++++-
 1 file changed, 141 insertions(+), 4 deletions(-)

diff --git a/tests/pandas/test_pandas.py b/tests/pandas/test_pandas.py
index c100c35009..1e64a03900 100644
--- a/tests/pandas/test_pandas.py
+++ b/tests/pandas/test_pandas.py
@@ -1,5 +1,6 @@
 """Tests the encrypted data-frame API abd its coherence with Pandas"""
 
+import copy
 import re
 import shutil
 import tempfile
@@ -298,7 +299,7 @@ def test_save_load():
 
 
 def check_invalid_merge_parameters():
-    """Check that unsupported or invalid parameters for merge raise the correct errors."""
+    """Check that unsupported or invalid parameters for merge raise correct errors."""
     encrypted_df_left, encrypted_df_right = get_two_encrypted_dataframes()
 
     unsupported_pandas_parameters_and_values = [
@@ -345,7 +346,7 @@ def check_no_multi_columns_merge():
 
 
 def check_column_coherence():
-    """Check that merging data-frames with unsupported scheme raise the correct errors."""
+    """Check that merging data-frames with unsupported scheme raises correct errors."""
     index_name = "index"
 
     # Test when a selected column has a different dtype than the other one
@@ -394,7 +395,7 @@ def check_column_coherence():
 
 
 def check_unsupported_input_values():
-    """Check that initializing a data-frame with unsupported inputs raise the correct errors."""
+    """Check that initializing a data-frame with unsupported inputs raises correct errors."""
     client = ClientEngine()
 
     # Test with integer values that are out of bound
@@ -451,7 +452,7 @@ def check_unsupported_input_values():
 
 
 def check_post_processing_coherence():
-    """Check post-processing a data-frame with unsupported scheme raise the correct errors."""
+    """Check post-processing a data-frame with unsupported scheme raises correct errors."""
     index_name = "index"
     client = ClientEngine()
 
@@ -480,6 +481,8 @@ def test_error_raises():
     check_column_coherence()
     check_unsupported_input_values()
     check_post_processing_coherence()
+    check_invalid_schema_format()
+    check_invalid_schema_values()
 
 
 def deserialize_client_file(client_path: Union[Path, str]) -> ClientSpecs:
@@ -643,3 +646,137 @@ def test_schema_input():
     assert pandas_dataframe_are_equal(
         clear_df_joined_1, pandas_joined_df, float_atol=1, equal_nan=True
     ), "Joined encrypted data-frame does not match Pandas' joined data-frame."
+
+
+def check_invalid_schema_format():
+    """Check that encrypting data-frames with an unsupported schema format raises correct errors."""
+    selected_column = "index"
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        keys_path = Path(temp_dir) / "keys"
+
+        client = ClientEngine(keys_path=keys_path)
+
+        pandas_df = generate_pandas_dataframe(index_name=selected_column)
+
+        with pytest.raises(
+            ValueError,
+            match="When set, parameter 'schema' must be a dictionary.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=[])
+
+        schema_wrong_column = {"wrong_column": {}}
+
+        with pytest.raises(
+            ValueError,
+            match="Column name '.*' found in the given schema cannot be found.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_wrong_column)
+
+        schema_wrong_mapping_type = {selected_column: []}
+
+        with pytest.raises(
+            ValueError,
+            match="Mapping for column '.*' is not a dictionary. .*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_wrong_mapping_type)
+
+
+def check_invalid_schema_values():
+    """Check that encrypting data-frames with unsupported schema values raises correct errors."""
+    selected_column = "index"
+    feat_name = "feat"
+    float_min = -10.0
+    float_max = 10.0
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        keys_path = Path(temp_dir) / "keys"
+
+        client = ClientEngine(keys_path=keys_path)
+
+        pandas_df = generate_pandas_dataframe(
+            feat_name=feat_name, index_name=selected_column, float_min=float_min, float_max=float_max
+        )
+
+        schema_int_column = {f"{feat_name}_int_1": {}}
+
+        with pytest.raises(
+            ValueError,
+            match="Column '.*' contains integer values and therefore does not.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_int_column)
+
+        schema_float_column = {f"{feat_name}_float_1": {"wrong_mapping": 1.0}}
+
+        with pytest.raises(
+            ValueError,
+            match="Column '.*' contains float values but the associated mapping.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_float_column)
+
+        schema_float_oob = {f"{feat_name}_float_1": {"min": float_min // 2, "max": float_max // 2}}
+
+        with pytest.raises(
+            ValueError,
+            match=r"Column '.*' \(dtype=float64\) contains values that are out of bounds.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_float_oob)
+
+        string_column = f"{feat_name}_str_1"
+
+        schema_string_nan = {string_column: {numpy.nan: 1}}
+
+        with pytest.raises(
+            ValueError,
+            match="String mapping for column '.*' contains numpy.NaN as a key.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_string_nan)
+
+        schema_string_missing_values = {string_column: {"apple": 1}}
+
+        with pytest.raises(
+            ValueError,
+            match="String mapping keys for column '.*' are not considering all values.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_string_missing_values)
+
+        # Retrieve the string column's unique values and create a mapping, except for NaN values
+        string_values = pandas_df[string_column].unique()
+        string_values = [
+            string_value for string_value in string_values if isinstance(string_value, str)
+        ]
+        string_mapping = {val: i for i, val in enumerate(string_values)}
+
+        string_mapping_non_int = copy.copy(string_mapping)
+        string_mapping_non_int[string_values[0]] = "orange"
+
+        schema_string_non_int = {string_column: string_mapping_non_int}
+
+        with pytest.raises(
+            ValueError,
+            match="String mapping values for column '.*' must be integers.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_string_non_int)
+
+        string_mapping_oob = copy.copy(string_mapping)
+        string_mapping_oob[string_values[0]] = -1
+
+        schema_string_oob = {string_column: string_mapping_oob}
+
+        with pytest.raises(
+            ValueError,
+            match="String mapping values for column '.*' are out of bounds.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_string_oob)
+
+        string_mapping_non_unique = copy.copy(string_mapping)
+        string_mapping_non_unique[string_values[0]] = 1
+        string_mapping_non_unique[string_values[1]] = 1
+
+        schema_string_non_unique = {string_column: string_mapping_non_unique}
+
+        with pytest.raises(
+            ValueError,
+            match="String mapping values for column '.*' must be unique.*",
+        ):
+            client.encrypt_from_pandas(pandas_df, schema=schema_string_non_unique)