From 152a2e2e5a7b93a779e0f683499d48d1bf4802ec Mon Sep 17 00:00:00 2001 From: Roman <56846628+RomanBredehoft@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:16:09 +0200 Subject: [PATCH] chore: improve inference tests (decision_function, predict, predict_proba) + post_processing --- conftest.py | 59 +- src/concrete/ml/sklearn/base.py | 23 +- src/concrete/ml/sklearn/neighbors.py | 28 +- tests/deployment/test_client_server.py | 242 ++++---- tests/quantization/test_quantized_ops.py | 24 +- tests/quantization/test_quantizers.py | 4 +- tests/sklearn/test_dump_onnx.py | 2 +- tests/sklearn/test_sklearn_models.py | 680 ++++++++++++----------- 8 files changed, 561 insertions(+), 501 deletions(-) diff --git a/conftest.py b/conftest.py index b073cc3bb..fbd86bc27 100644 --- a/conftest.py +++ b/conftest.py @@ -288,41 +288,40 @@ def check_circuit_precision(): return check_circuit_precision_impl -def check_array_equality_impl(actual: Any, expected: Any, verbose: bool = True): - """Assert that `actual` is equal to `expected`.""" - - assert numpy.array_equal(actual, expected), ( - "" - if not verbose - else f""" +@pytest.fixture +def check_array_equal(): + """Fixture to check array equality.""" -Expected Output -=============== -{expected} + def check_array_equal_impl(actual: Any, expected: Any, verbose: bool = True): + """Assert that `actual` is equal to `expected`.""" -Actual Output -============= -{actual} + assert numpy.array_equal(actual, expected), ( + "" + if not verbose + else f""" - """ - ) + Expected Output + =============== + {expected} + Actual Output + ============= + {actual} -@pytest.fixture -def check_array_equality(): - """Fixture to check array equality.""" + """ + ) - return check_array_equality_impl + return check_array_equal_impl @pytest.fixture -def check_float_arrays_equal(): +def check_float_array_equal(): """Fixture to check if two float arrays are equal with epsilon precision tolerance.""" - def check_float_arrays_equal_impl(a, b): + def check_float_array_equal_impl(a, b): assert numpy.all(numpy.isclose(a, b, rtol=0, atol=0.001)) - return check_float_arrays_equal_impl + return check_float_array_equal_impl @pytest.fixture @@ -492,15 +491,13 @@ def check_is_good_execution_for_cml_vs_circuit_impl( # as much post-processing steps in the clear (that could lead to more flaky # tests), especially since these results are tested in other tests such as the # `check_subfunctions_in_fhe` - if is_classifier_or_partial_classifier(model): - if isinstance(model, SklearnKNeighborsMixin): - # For KNN `predict_proba` is not supported for now - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 - results_cnp_circuit = model.predict(*inputs, fhe=fhe_mode) - results_model = model.predict(*inputs, fhe="disable") - else: - results_cnp_circuit = model.predict_proba(*inputs, fhe=fhe_mode) - results_model = model.predict_proba(*inputs, fhe="disable") + # For KNN `predict_proba` is not supported for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + if is_classifier_or_partial_classifier(model) and not isinstance( + model, SklearnKNeighborsMixin + ): + results_cnp_circuit = model.predict_proba(*inputs, fhe=fhe_mode) + results_model = model.predict_proba(*inputs, fhe="disable") else: results_cnp_circuit = model.predict(*inputs, fhe=fhe_mode) diff --git a/src/concrete/ml/sklearn/base.py b/src/concrete/ml/sklearn/base.py index 1978f0fdf..4d7ae7251 100644 --- a/src/concrete/ml/sklearn/base.py +++ b/src/concrete/ml/sklearn/base.py @@ -260,15 +260,6 @@ def 
fhe_circuit(self) -> Optional[Circuit]: assert isinstance(self.fhe_circuit_, Circuit) or self.fhe_circuit_ is None return self.fhe_circuit_ - @fhe_circuit.setter - def fhe_circuit(self, value: Circuit) -> None: - """Set the FHE circuit. - - Args: - value (Circuit): The FHE circuit to set. - """ - self.fhe_circuit_ = value - def _sklearn_model_is_not_fitted_error_message(self) -> str: return ( f"The underlying model (class: {self.sklearn_model_class}) is not fitted and thus " @@ -556,7 +547,7 @@ def compile( # Jit compiler is now deprecated and will soon be removed, it is thus forced to False # by default - self.fhe_circuit = module_to_compile.compile( + self.fhe_circuit_ = module_to_compile.compile( inputset, configuration=configuration, artifacts=artifacts, @@ -570,6 +561,9 @@ def compile( jit=False, ) + # For mypy + assert isinstance(self.fhe_circuit, Circuit) + # CRT simulation is not supported yet # TODO: https://github.com/zama-ai/concrete-ml-internal/issues/3841 if not USE_OLD_VL: @@ -577,7 +571,6 @@ def compile( self._is_compiled = True - assert isinstance(self.fhe_circuit, Circuit) return self.fhe_circuit @abstractmethod @@ -883,10 +876,6 @@ def output_quantizers(self, value: List[UniformQuantizer]) -> None: def fhe_circuit(self) -> Circuit: return self.quantized_module_.fhe_circuit - @fhe_circuit.setter - def fhe_circuit(self, value: Circuit) -> None: - self.quantized_module_.fhe_circuit = value - def get_params(self, deep: bool = True) -> dict: """Get parameters for this estimator. @@ -2093,11 +2082,11 @@ def predict(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy. topk_labels = [] for query in X: - topk_labels.append(super().predict(query[None], fhe)) + topk_labels.append(BaseEstimator.predict(self, query[None], fhe=fhe)) y_preds = self.post_processing(numpy.array(topk_labels)) - return numpy.array(y_preds) + return y_preds class SklearnKNeighborsClassifierMixin(SklearnKNeighborsMixin, sklearn.base.ClassifierMixin, ABC): diff --git a/src/concrete/ml/sklearn/neighbors.py b/src/concrete/ml/sklearn/neighbors.py index 368c9690b..c17180964 100644 --- a/src/concrete/ml/sklearn/neighbors.py +++ b/src/concrete/ml/sklearn/neighbors.py @@ -1,11 +1,12 @@ """Implement sklearn linear model.""" -from typing import Any, Dict +from typing import Any, Dict, Union import numpy import sklearn.linear_model from ..common.debugging.custom_assert import assert_true -from .base import SklearnKNeighborsClassifierMixin +from ..common.utils import FheMode +from .base import Data, SklearnKNeighborsClassifierMixin # pylint: disable=invalid-name,too-many-instance-attributes @@ -123,3 +124,26 @@ def load_dict(cls, metadata: Dict): obj.metric_params = metadata["metric_params"] obj.n_jobs = metadata["n_jobs"] return obj + + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + def predict_proba(self, X: Data, fhe: Union[FheMode, str] = FheMode.DISABLE) -> numpy.ndarray: + """Predict class probabilities. + + Args: + X (Data): The input values to predict, as a Numpy array, Torch tensor, Pandas DataFrame + or List. + fhe (Union[FheMode, str]): The mode to use for prediction. + Can be FheMode.DISABLE for Concrete ML Python inference, + FheMode.SIMULATE for FHE simulation and FheMode.EXECUTE for actual FHE execution. + Can also be the string representation of any of these values. + Default to FheMode.DISABLE. + + Raises: + NotImplementedError: The method is not implemented for now. 
+ """ + + raise NotImplementedError( + "The `predict_proba` method is not implemented for KNeighborsClassifier. Please " + "call `predict` instead." + ) diff --git a/tests/deployment/test_client_server.py b/tests/deployment/test_client_server.py index ab3b2d2f4..b0507bfc5 100644 --- a/tests/deployment/test_client_server.py +++ b/tests/deployment/test_client_server.py @@ -22,7 +22,7 @@ class OnDiskNetwork: - """Simulate a network on disk.""" + """A network interaction on disk.""" def __init__(self): # Create 3 temporary folder for server, client and dev with tempfile @@ -75,8 +75,10 @@ def test_client_server_sklearn( n_bits, load_data, check_is_good_execution_for_cml_vs_circuit, + check_array_equal, + check_float_array_equal, ): - """Tests the encrypt decrypt api.""" + """Test the client-server interface for built-in models.""" if get_model_name(model_class) == "KNeighborsClassifier": # Skipping KNN for this test @@ -97,172 +99,192 @@ def test_client_server_sklearn( warnings.simplefilter("ignore", category=ConvergenceWarning) model.fit(x_train, y_train) - # Compile - extra_params = {"global_p_error": 1 / 100_000} + key_dir = default_configuration.insecure_key_cache_location # Running the simulation using a model that is not compiled should not be possible with pytest.raises(AttributeError, match=".* model is not compiled.*"): - client_server_simulation(x_train, x_test, model, default_configuration) + check_client_server_execution( + x_test, model, key_dir, check_array_equal, check_float_array_equal + ) + + # Compile the model + fhe_circuit = model.compile(x_train, configuration=default_configuration) - fhe_circuit = model.compile(x_train, default_configuration, **extra_params, show_mlir=False) + # Check that client and server files are properly generated + check_client_server_files(model) max_bit_width = fhe_circuit.graph.maximum_integer_bit_width() print(f"Max width {max_bit_width}") - # Compare the FHE predictions with the clear ones. - # Note that: - # - With a global_p_error of 1/100_000 we only allow one run. - # - The simulated predictions are not considered in this test. + # Compare the FHE predictions with the clear ones. Simulated predictions are not considered in + # this test. 
check_is_good_execution_for_cml_vs_circuit(x_test, model, simulate=False, n_allowed_runs=1) # Check client/server FHE predictions vs the FHE predictions of the dev model - client_server_simulation(x_train, x_test, model, default_configuration) + check_client_server_execution( + x_test, model, key_dir, check_array_equal, check_float_array_equal + ) def test_client_server_custom_model( - default_configuration, check_is_good_execution_for_cml_vs_circuit + default_configuration, + check_is_good_execution_for_cml_vs_circuit, + check_array_equal, + check_float_array_equal, ): - """Tests the client server custom model.""" + """Test the client-server interface for a custom model (through a quantized module).""" # Generate random data x_train, x_test = numpy.random.rand(100, 2), numpy.random.rand(1, 2) + key_dir = default_configuration.insecure_key_cache_location + # Running the simulation using a QuantizedModule that is not compiled should not be possible with pytest.raises(AttributeError, match=".* quantized module is not compiled.*"): # Instantiate an empty QuantizedModule object quantized_module = QuantizedModule() - client_server_simulation(x_train, x_test, quantized_module, default_configuration) + check_client_server_execution( + x_test, quantized_module, key_dir, check_array_equal, check_float_array_equal + ) torch_model = FCSmall(2, nn.ReLU) - n_bits = 2 - # Get the quantized module from the model + # Get the quantized module from the model and compile it quantized_numpy_module = compile_torch_model( torch_model, x_train, configuration=default_configuration, - n_bits=n_bits, - global_p_error=1 / 100_000, + n_bits=2, ) + # Check that client and server files are properly generated + check_client_server_files(quantized_numpy_module) + # Check that the FHE execution is correct. - # With a global_p_error of 1/100_000 we only allow one run. check_is_good_execution_for_cml_vs_circuit( x_test, quantized_numpy_module, simulate=False, n_allowed_runs=1 ) - client_server_simulation(x_train, x_test, quantized_numpy_module, default_configuration) - - -def client_server_simulation(x_train, x_test, model, default_configuration): - """Simulate the client server interaction.""" - # Model has been trained and compiled on the server. - # Now we use the fhe api to go into production. - - # Set up the fake network - network = OnDiskNetwork() - - # Instantiate the dev client and server FHEModel client server API - fhemodel_dev = FHEModelDev(path_dir=network.dev_dir.name, model=model) - fhemodel_dev.save() - - # Check that the processing json file is in the client.zip file - with zipfile.ZipFile(Path(network.dev_dir.name) / "client.zip") as client_zip: - with client_zip.open("serialized_processing.json", "r") as file: - assert isinstance(json.load(file), dict) - - # Send necessary files to server and client - network.dev_send_clientspecs_and_modelspecs_to_client() - network.dev_send_model_to_server() - - # Make sure the save fails now that the folder is populated - err_msg = ( - f"path_dir: {network.dev_dir.name} is not empty." - "Please delete it before saving a new model." 
- ) - with pytest.raises(Exception, match=err_msg): - fhemodel_dev.save() - - fhemodel_client = FHEModelClient( - path_dir=network.client_dir.name, - key_dir=default_configuration.insecure_key_cache_location, + check_client_server_execution( + x_test, quantized_numpy_module, key_dir, check_array_equal, check_float_array_equal ) - fhemodel_client.load() - # Grab the model and save it again - # No user is expected to load a FHEModelDev instance from a FHEModelClient's model. This is - # only made a testing for making sure the model has the expected attributes - client_model = fhemodel_client.model - client_model.fhe_circuit = model.fhe_circuit - # pylint: disable-next=protected-access - client_model._is_compiled = True - - network.cleanup() +def check_client_server_files(model): + """Test the client server interface API generates the expected file. + This test expects that the given model has been trained and compiled in development. + """ # Create a new network - network = OnDiskNetwork() + disk_network = OnDiskNetwork() # And try to save it again - fhemodel_dev_ = FHEModelDev(path_dir=network.dev_dir.name, model=client_model) - fhemodel_dev_.save() - - # Send necessary files to server and client - network.dev_send_clientspecs_and_modelspecs_to_client() - network.dev_send_model_to_server() + fhe_model_dev = FHEModelDev(path_dir=disk_network.dev_dir.name, model=model) + fhe_model_dev.save() + + # Check that re-saving the dev model fails + with pytest.raises( + Exception, + match=( + f"path_dir: {disk_network.dev_dir.name} is not empty." + "Please delete it before saving a new model." + ), + ): + fhe_model_dev.save() + + client_zip_path = Path(disk_network.dev_dir.name) / "client.zip" + server_zip_path = Path(disk_network.dev_dir.name) / "server.zip" + + # Check that client and server zip files has been generated + assert ( + client_zip_path.is_file() + ), f"Client files were not properly generated. Expected {client_zip_path} to be a file." + assert ( + server_zip_path.is_file() + ), f"Server files were not properly generated. Expected {server_zip_path} to be a file." + + processing_file_name = "serialized_processing.json" + versions_file_name = "versions.json" + + # Check that the client.zip file has the processing and versions json files + with zipfile.ZipFile(client_zip_path) as client_zip: + with client_zip.open(processing_file_name, "r") as file: + assert isinstance( + json.load(file), dict + ), f"{client_zip_path} does not contain a '{processing_file_name}' file." + + with client_zip.open(versions_file_name, "r") as file: + assert isinstance( + json.load(file), dict + ), f"{client_zip_path} does not contain a '{versions_file_name}' file." + + # Check that the server.zip file has the versions json file + with zipfile.ZipFile(server_zip_path) as server_zip: + with server_zip.open("versions.json", "r") as file: + assert isinstance( + json.load(file), dict + ), f"{server_zip_path} does not contain a '{versions_file_name}' file." 
-    # And try to load it again
-    fhemodel_client_ = FHEModelClient(
-        path_dir=network.client_dir.name,
-        key_dir=default_configuration.insecure_key_cache_location,
-    )
-    fhemodel_client_.load()
+    # Clean up
+    disk_network.cleanup()
-    # Now we can also load the server part
-    fhemodel_server = FHEModelServer(path_dir=network.server_dir.name)
-    fhemodel_server.load()
-    # Make sure the client has the exact same quantization as the server
-    qx_ref_model = model.quantize_input(x_train)
-    qx_client = fhemodel_client.model.quantize_input(x_train)
-    qx_dev = fhemodel_dev.model.quantize_input(x_train)
-    numpy.testing.assert_array_equal(qx_ref_model, qx_client)
-    numpy.testing.assert_array_equal(qx_ref_model, qx_dev)
+def check_client_server_execution(
+    x_test, model, key_dir, check_array_equal, check_float_array_equal
+):
+    """Test the client server interface API.
-    # Create evaluation keys for the server
-    fhemodel_client.generate_private_and_evaluation_keys()
+    This test expects that the given model has been trained and compiled in development. It
+    basically replicates a production-like interaction and checks that the results match the
+    development model.
+    """
+    # Create a new network
+    disk_network = OnDiskNetwork()
-    # Get the server evaluation key
-    serialized_evaluation_keys = fhemodel_client.get_serialized_evaluation_keys()
+    # Save development files
+    fhe_model_dev = FHEModelDev(path_dir=disk_network.dev_dir.name, model=model)
+    fhe_model_dev.save()
-    # Encrypt new data
-    serialized_qx_new_encrypted = fhemodel_client.quantize_encrypt_serialize(x_test)
+    # Send necessary files to server and client
+    disk_network.dev_send_clientspecs_and_modelspecs_to_client()
+    disk_network.dev_send_model_to_server()
-    # Here data can be saved, sent over the network, etc.
+    # Load the client
+    fhe_model_client = FHEModelClient(
+        path_dir=disk_network.client_dir.name,
+        key_dir=key_dir,
+    )
+    fhe_model_client.load()
-    # Now back to the server
+    # Load the server
+    fhe_model_server = FHEModelServer(path_dir=disk_network.server_dir.name)
+    fhe_model_server.load()
-    # Run the model over encrypted data
-    serialized_result = fhemodel_server.run(serialized_qx_new_encrypted, serialized_evaluation_keys)
+    # Client side : Generate all keys and serialize the evaluation keys for the server
+    fhe_model_client.generate_private_and_evaluation_keys()
+    evaluation_keys = fhe_model_client.get_serialized_evaluation_keys()
-    # Back to the client
+    # Client side : Encrypt the data
+    q_x_encrypted_serialized = fhe_model_client.quantize_encrypt_serialize(x_test)
-    # Decrypt, de-quantize and post-processed the result
-    y_pred_on_client_quantized = fhemodel_client.deserialize_decrypt(serialized_result)
-    y_pred_on_client_dequantized = fhemodel_client.deserialize_decrypt_dequantize(serialized_result)
+    # Server side: Run the model over encrypted data
+    q_y_pred_encrypted_serialized = fhe_model_server.run(q_x_encrypted_serialized, evaluation_keys)
-    # Get the y_pred_model_server_clear
+    # Client side : Decrypt, de-quantize and post-process the result
+    q_y_pred = fhe_model_client.deserialize_decrypt(q_y_pred_encrypted_serialized)
+    y_pred = fhe_model_client.deserialize_decrypt_dequantize(q_y_pred_encrypted_serialized)
-    # Predict based on the model we are testing
-    qtest = model.quantize_input(x_test)
-    y_pred_model_dev_quantized = model.fhe_circuit.encrypt_run_decrypt(qtest)
-    y_pred_model_dev_dequantized = model.dequantize_output(y_pred_model_dev_quantized)
-    y_pred_model_dev_dequantized = model.post_processing(y_pred_model_dev_dequantized)
+    # Dev side: Predict using the model and circuit from development
+    q_x_test = model.quantize_input(x_test)
+    q_y_pred_dev = model.fhe_circuit.encrypt_run_decrypt(q_x_test)
+    y_pred_dev = model.dequantize_output(q_y_pred_dev)
+    y_pred_dev = model.post_processing(y_pred_dev)
-    # Make sure the quantized predictions are the same for the client model and the dev model
-    numpy.testing.assert_array_equal(y_pred_on_client_quantized, y_pred_model_dev_quantized)
-    numpy.testing.assert_array_equal(y_pred_on_client_dequantized, y_pred_model_dev_dequantized)
+    # Check that both the quantized and de-quantized (+ post-processed) results from the server
+    # match the ones from the dev model
+    check_float_array_equal(y_pred, y_pred_dev)
+    check_array_equal(q_y_pred, q_y_pred_dev)
     # Clean up
-    network.cleanup()
+    disk_network.cleanup()
diff --git a/tests/quantization/test_quantized_ops.py b/tests/quantization/test_quantized_ops.py
index d3e7a15ee..fd4023850 100644
--- a/tests/quantization/test_quantized_ops.py
+++ b/tests/quantization/test_quantized_ops.py
@@ -325,7 +325,7 @@ def test_all_arith_ops(
     n_dims: int,
     generator: Callable,
     check_r2_score: Callable,
-    check_float_arrays_equal: Callable,
+    check_float_array_equal: Callable,
 ):
     """Test all quantized arithmetic ops"""
@@ -407,11 +407,11 @@ def test_all_arith_ops(
     # Check that we get the same fp32 results in V+V (if supported), V+C and C+V modes
     if supports_enc_with_enc:
-        check_float_arrays_equal(raw_output_vv, raw_output_vc)
-        check_float_arrays_equal(raw_output_cv, raw_output_vc)
+        check_float_array_equal(raw_output_vv, raw_output_vc)
+        check_float_array_equal(raw_output_cv, raw_output_vc)
     # Check that V+C and C+V is symmetric (int+float mode)
-    check_float_arrays_equal(quantized_output_cv,
quantized_output_vc) + check_float_array_equal(quantized_output_cv, quantized_output_vc) # As V+C and C+V work on float values they will not be exactly equal to # the V+V case which works in quantized, we only check R2 for a high bit-width in this case @@ -454,7 +454,7 @@ def test_all_gemm_ops( n_neurons: int, generator: Callable, check_r2_score: Callable, - check_array_equality: Callable, + check_array_equal: Callable, ): """Test for gemm style ops.""" @@ -547,7 +547,7 @@ def test_all_gemm_ops( check_r2_score(expected_gemm_outputs, actual_gemm_output) # Without a bias, MatMul and Gemm should give the same output - check_array_equality(actual_mm_output, actual_gemm_output) + check_array_equal(actual_mm_output, actual_gemm_output) # Test the serialization of QuantizedGemm with (alpha, beta) = (1, 0) check_serialization( @@ -672,7 +672,7 @@ def test_identity_op(x, n_bits): ) @pytest.mark.parametrize("produces_output", [True, False]) # pylint: disable-next=too-many-locals -def test_quantized_conv(params, n_bits, produces_output, check_r2_score, check_float_arrays_equal): +def test_quantized_conv(params, n_bits, produces_output, check_r2_score, check_float_array_equal): """Test the quantized convolution operator.""" # Retrieve arguments @@ -733,7 +733,7 @@ def test_quantized_conv(params, n_bits, produces_output, check_r2_score, check_f strides, groups=group, ).numpy() - check_float_arrays_equal(torch_res, expected_result) + check_float_array_equal(torch_res, expected_result) # Compute the quantized result result = q_op(q_input).dequant() @@ -810,7 +810,7 @@ def test_quantized_conv(params, n_bits, produces_output, check_r2_score, check_f ], ) @pytest.mark.parametrize("is_signed", [True, False]) -def test_quantized_avg_pool(params, n_bits, is_signed, check_r2_score, check_float_arrays_equal): +def test_quantized_avg_pool(params, n_bits, is_signed, check_r2_score, check_float_array_equal): """Test the quantized average pool operator.""" # Retrieve arguments @@ -842,7 +842,7 @@ def test_quantized_avg_pool(params, n_bits, is_signed, check_r2_score, check_flo # Compute the torch average pool bceil_mode = bool(ceil_mode) torch_res = torch.nn.functional.avg_pool2d(tx_pad, kernel_shape, strides, 0, bceil_mode).numpy() - check_float_arrays_equal(torch_res, expected_result) + check_float_array_equal(torch_res, expected_result) # Compute the quantized result result = q_op(q_input).dequant() @@ -929,7 +929,7 @@ def test_quantized_avg_pool(params, n_bits, is_signed, check_r2_score, check_flo ], ) @pytest.mark.parametrize("is_signed", [True, False]) -def test_quantized_max_pool(params, n_bits, is_signed, check_r2_score, check_float_arrays_equal): +def test_quantized_max_pool(params, n_bits, is_signed, check_r2_score, check_float_array_equal): """Test the quantized max pool operator.""" # Retrieve arguments @@ -974,7 +974,7 @@ def test_quantized_max_pool(params, n_bits, is_signed, check_r2_score, check_flo print("Expected") print(expected_result) - check_float_arrays_equal(torch_res, expected_result) + check_float_array_equal(torch_res, expected_result) # Compute the quantized result result = q_op(q_input).dequant() diff --git a/tests/quantization/test_quantizers.py b/tests/quantization/test_quantizers.py index fcbbb6bf9..62d8b675a 100644 --- a/tests/quantization/test_quantizers.py +++ b/tests/quantization/test_quantizers.py @@ -22,7 +22,7 @@ [pytest.param(True, True), pytest.param(True, False), pytest.param(False, False)], ) @pytest.mark.parametrize("values", [pytest.param(numpy.random.randn(2000))]) -def 
test_quant_dequant_update(values, n_bits, is_signed, is_symmetric, check_array_equality): +def test_quant_dequant_update(values, n_bits, is_signed, is_symmetric, check_array_equal): """Test the quant and de-quant function.""" quant_array = QuantizedArray(n_bits, values, is_signed=is_signed, is_symmetric=is_symmetric) @@ -72,7 +72,7 @@ def test_quant_dequant_update(values, n_bits, is_signed, is_symmetric, check_arr assert not numpy.array_equal(new_values, new_values_updated) # Check that the __call__ returns also the qvalues. - check_array_equality(quant_array(), new_qvalues) + check_array_equal(quant_array(), new_qvalues) @pytest.mark.parametrize( diff --git a/tests/sklearn/test_dump_onnx.py b/tests/sklearn/test_dump_onnx.py index 94e688622..3f72bfea6 100644 --- a/tests/sklearn/test_dump_onnx.py +++ b/tests/sklearn/test_dump_onnx.py @@ -36,7 +36,7 @@ def check_onnx_file_dump(model_class, parameters, load_data, str_expected, defau model.set_params(**model_params) if get_model_name(model) == "KNeighborsClassifier": - # KNN works only for small quantization bits + # KNN can only be compiled with small quantization bit numbers for now # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3979 model.n_bits = 2 diff --git a/tests/sklearn/test_sklearn_models.py b/tests/sklearn/test_sklearn_models.py index b95e616be..ab6b6bbff 100644 --- a/tests/sklearn/test_sklearn_models.py +++ b/tests/sklearn/test_sklearn_models.py @@ -19,12 +19,6 @@ - pipeline - calls to predict_proba - calls to decision_function - -Are currently missing - - check of predict_proba - - check of decision_function - -More information in https://github.com/zama-ai/concrete-ml-internal/issues/2682 """ import copy @@ -81,20 +75,11 @@ # sufficiently number of bits for precision N_BITS_THRESHOLD_FOR_SKLEARN_CORRECTNESS_TESTS = 26 -# We check correctness with check_is_good_execution_for_cml_vs_circuit or predict in -# fhe="disable" only if n_bits >= N_BITS_THRESHOLD_FOR_PREDICT_CORRECTNESS_TESTS. This is -# because we need sufficiently number of bits for precision -N_BITS_THRESHOLD_FOR_PREDICT_CORRECTNESS_TESTS = 6 - # We never do checks with check_is_good_execution_for_cml_vs_circuit if # n_bits >= N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE. This is because computations are very # slow N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE = 17 -assert ( - N_BITS_THRESHOLD_FOR_PREDICT_CORRECTNESS_TESTS <= N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE -) - # If n_bits >= N_BITS_THRESHOLD_FOR_SKLEARN_EQUIVALENCE_TESTS, we check that the two models # returned by fit_benchmark (the Concrete ML model and the scikit-learn model) are equivalent N_BITS_THRESHOLD_FOR_SKLEARN_EQUIVALENCE_TESTS = 16 @@ -105,9 +90,9 @@ N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS = 11 # n_bits that we test, either in regular builds or just in weekly builds. 6 is to do tests in -# FHE which are not too long (relation with N_BITS_THRESHOLD_FOR_PREDICT_CORRECTNESS_TESTS and -# N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE). 26 is in relation with -# N_BITS_THRESHOLD_FOR_SKLEARN_CORRECTNESS_TESTS, to do tests with check_correctness_with_sklearn +# FHE which are not too long (relation with N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE). 
+# 26 is in relation with N_BITS_THRESHOLD_FOR_SKLEARN_CORRECTNESS_TESTS, to do tests with +# check_correctness_with_sklearn N_BITS_REGULAR_BUILDS = [6, 26] N_BITS_WEEKLY_ONLY_BUILDS = [2, 8, 16] @@ -125,7 +110,7 @@ def get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option): model_class, _get_sklearn_linear_models() + _get_sklearn_neighbors_models() ): if n_bits in N_BITS_WEEKLY_ONLY_BUILDS and not is_weekly_option: - pytest.skip("Skipping some tests in non-weekly builds, except for linear models") + pytest.skip("Skipping some tests in non-weekly builds") # Get the data-set. The data generation is seeded in load_data. x, y = load_data(model_class, **parameters) @@ -154,6 +139,19 @@ def preamble(model_class, parameters, n_bits, load_data, is_weekly_option): return model, x +def get_n_bits_non_correctness(model_class): + """Get the number of bits to use for non correctness related tests.""" + + if get_model_name(model_class) == "KNeighborsClassifier": + # KNN can only be compiled with small quantization bit numbers for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3979 + n_bits = 2 + else: + n_bits = min(N_BITS_REGULAR_BUILDS) + + return n_bits + + def check_correctness_with_sklearn( model_class, x, @@ -176,19 +174,11 @@ def check_correctness_with_sklearn( warnings.simplefilter("ignore", category=ConvergenceWarning) model, sklearn_model = model.fit_benchmark(x, y) - y_pred = model.predict(x) - - y_pred_sklearn = sklearn_model.predict(x) - y_pred_cml = model.predict(x, fhe=fhe) - - # Check that the output shapes are correct - assert y_pred.shape == y_pred_cml.shape, "Outputs have different shapes" - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2604 # Generic tests look to show issues in accuracy / R2 score, even for high n_bits - # For regressions - acceptance_r2score_dic = { + # For R2 score measures + acceptance_r2scores = { "TweedieRegressor": 0.9, "GammaRegressor": 0.9, "LinearRegression": 0.9, @@ -198,33 +188,79 @@ def check_correctness_with_sklearn( "Ridge": 0.9, "ElasticNet": 0.9, "XGBRegressor": -0.2, - "NeuralNetRegressor": -10, } - # For classifiers - threshold_accuracy_dic = { + # For accuracy measures + threshold_accuracies = { "LogisticRegression": 0.9, "LinearSVC": 0.9, "XGBClassifier": 0.7, "RandomForestClassifier": 0.8, - "NeuralNetClassifier": 0.7, "KNeighborsClassifier": 0.9, } model_name = get_model_name(model_class) - acceptance_r2score = acceptance_r2score_dic.get(model_name, 0.9) - threshold_accuracy = threshold_accuracy_dic.get(model_name, 0.9) + acceptance_r2score = acceptance_r2scores.get(model_name, 0.9) + threshold_accuracy = threshold_accuracies.get(model_name, 0.9) + + # If the model is a classifier + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + if ( + is_classifier_or_partial_classifier(model) + and get_model_name(model_class) != "KNeighborsClassifier" + ): + if is_model_class_in_a_list(model, _get_sklearn_linear_models()): + + # Check outputs from the 'decision_function' method (for linear classifiers) + y_scores_sklearn = sklearn_model.decision_function(x) + y_scores_fhe = model.decision_function(x, fhe=fhe) + + # Currently, for single target data sets, Concrete models' outputs have shape (n, 1) + # while scikit-learn models' outputs have shape (n, ) + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4029 + # assert y_scores_sklearn.shape == y_scores_fhe.shape, ( + # "Method 
'decision_function' outputs different shapes between scikit-learn and "
+            #     f"Concrete ML in FHE (fhe={fhe})"
+            # )
+            check_r2_score(y_scores_sklearn, y_scores_fhe, acceptance_score=acceptance_r2score)
+
+            # LinearSVC models from scikit-learn do not provide a 'predict_proba' method
+            if get_model_name(model_class) != "LinearSVC":
+
+                # Check outputs from the 'predict_proba' method (for all classifiers,
+                # except KNeighborsClassifier)
+                y_proba_sklearn = sklearn_model.predict_proba(x)
+                y_proba_fhe = model.predict_proba(x, fhe=fhe)
+
+                assert y_proba_sklearn.shape == y_proba_fhe.shape, (
+                    "Method 'predict_proba' outputs different shapes between scikit-learn and "
+                    f"Concrete ML in FHE (fhe={fhe})"
+                )
+                check_r2_score(y_proba_sklearn, y_proba_fhe, acceptance_score=acceptance_r2score)
+
+    # Check outputs from the 'predict' method (for all models)
+    y_pred_sklearn = sklearn_model.predict(x)
+    y_pred_fhe = model.predict(x, fhe=fhe)
+
+    # Currently, for single target data sets, Concrete models' outputs have shape (n, 1) while
+    # scikit-learn models' outputs have shape (n, )
+    # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4029
+    # assert y_pred_sklearn.shape == y_pred_fhe.shape, (
+    #     "Method 'predict' outputs different shapes between scikit-learn and "
+    #     f"Concrete ML in FHE (fhe={fhe})"
+    # )

     # If the model is a classifier, check that accuracies are similar
     if is_classifier_or_partial_classifier(model):
-        check_accuracy(y_pred_sklearn, y_pred_cml, threshold=threshold_accuracy)
+        check_accuracy(y_pred_sklearn, y_pred_fhe, threshold=threshold_accuracy)

     # If the model is a regressor, check that R2 scores are similar
+    elif is_regressor_or_partial_regressor(model):
+        check_r2_score(y_pred_sklearn, y_pred_fhe, acceptance_score=acceptance_r2score)
+
     else:
-        assert is_regressor_or_partial_regressor(
-            model
-        ), "not a regressor, not a classifier, really?"
-        check_r2_score(y_pred_sklearn, y_pred_cml, acceptance_score=acceptance_r2score)
+        raise AssertionError(f"Model {model_name} is neither a classifier nor a regressor.")


 def check_double_fit(model_class, n_bits, x_1, x_2, y_1, y_2):
@@ -462,10 +498,8 @@ def check_offset(model_class, n_bits, x, y):
         model.fit(x, y)


-def check_subfunctions(fitted_model, model_class, x):
-    """Check subfunctions."""
-
-    fitted_model.predict(x[:1])
+def check_inference_methods(model, model_class, x, check_float_array_equal):
+    """Check that all inference methods provided are coherent between clear and FHE executions."""

     # skorch provides a predict_proba method for neural network regressors while Scikit-Learn does
     # not. We decided to follow Scikit-Learn's API as we build most of our tools on this library.
@@ -474,7 +508,7 @@ def check_subfunctions(fitted_model, model_class, x):
     # confusion, a NotImplementedError is raised. This issue could be fixed by making these classes
     # not inherit from skorch.
     # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3373
-    if get_model_name(fitted_model) == "NeuralNetRegressor":
+    if get_model_name(model) == "NeuralNetRegressor":
         with pytest.raises(
             NotImplementedError,
             match=(
                 "The `predict_proba` method is not implemented for the regressor models. "
                 "Please call `predict` instead."
), ): - fitted_model.predict_proba(x) + model.predict_proba(x) - if get_model_name(fitted_model) == "KNeighborsClassifier": - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 - pytest.skip("Skipping subfunctions test for KNN, doesn't work for now") - - if is_classifier_or_partial_classifier(model_class): + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + elif get_model_name(model) == "KNeighborsClassifier": + with pytest.raises( + NotImplementedError, + match=( + "The `predict_proba` method is not implemented for KNeighborsClassifier. " + "Please call `predict` instead." + ), + ): + model.predict_proba(x) - fitted_model.predict_proba(x) + # Only check 'predict_proba' and not 'predict' as some issues were found with the argmax not + # being consistent because of precision errors with epsilon magnitude. This argmax should be + # done in the clear the same way for both anyway. Ultimately, we would want to only compare the + # circuit's quantized outputs against the ones computed in the clear but built-in models do not + # currently provide the necessary API for that + elif is_classifier_or_partial_classifier(model_class): - # Only linear classifiers have a decision function method if is_model_class_in_a_list(model_class, _get_sklearn_linear_models()): - fitted_model.decision_function(x) + # Check outputs from the 'decision_function' method (for all linear classifiers) + y_scores_clear = model.decision_function(x) + y_scores_simulated = model.decision_function(x, fhe="simulate") + + assert y_scores_clear.shape == y_scores_simulated.shape, ( + "Method 'decision_function' from Concrete ML outputs different shapes when executed" + "in the clear and with simulation." + ) + check_float_array_equal(y_scores_clear, y_scores_simulated) -def check_subfunctions_in_fhe(model, fhe_circuit, x): - """Check subfunctions in FHE: calls and correctness.""" + else: + # Check outputs from the 'predict_proba' method (for all non-linear classifiers, + # except KNeighborsClassifier) + y_proba_clear = model.predict_proba(x) + y_proba_simulated = model.predict_proba(x, fhe="simulate") + + assert y_proba_clear.shape == y_proba_simulated.shape, ( + "Method 'predict_proba' from Concrete ML outputs different shapes when executed" + "in the clear and with simulation." + ) + check_float_array_equal(y_proba_clear, y_proba_simulated) + + else: + # Check outputs from the 'predict' method (for all regressors and KNeighborsClassifier) + y_pred_clear = model.predict(x) + y_pred_simulated = model.predict(x, fhe="simulate") + + assert y_pred_clear.shape == y_pred_simulated.shape, ( + "Method 'predict' from Concrete ML outputs different shapes when executed in the clear " + "and with simulation." 
+ ) + check_float_array_equal(y_pred_clear, y_pred_simulated) + + +def check_separated_inference(model, fhe_circuit, x, check_float_array_equal): + """Run inference methods in separated steps and check their correctness.""" # Generate the keys fhe_circuit.keygen() - y_pred_fhe = [] + # Quantize an input (float) + q_x = model.quantize_input(x) - for _ in range(N_ALLOWED_FHE_RUN): - for f_input in x: - # Quantize an input (float) - q_input = model.quantize_input(f_input.reshape(1, -1)) + # Encrypt the input + q_x_encrypted = fhe_circuit.encrypt(q_x) - # Encrypt the input - q_input_enc = fhe_circuit.encrypt(q_input) + # Execute the linear product in FHE + q_y_pred_encrypted = fhe_circuit.run(q_x_encrypted) - # Execute the linear product in FHE - q_y_enc = fhe_circuit.run(q_input_enc) + # Decrypt the result (integer) + q_y_pred = fhe_circuit.decrypt(q_y_pred_encrypted) - # Decrypt the result (integer) - q_y = fhe_circuit.decrypt(q_y_enc) + # De-quantize the result + y_pred = model.dequantize_output(q_y_pred) - # De-quantize the result - y = model.dequantize_output(q_y) + if is_model_class_in_a_list( + model, _get_sklearn_linear_models(classifier=True, regressor=False) + ): + y_scores = model.decision_function(x, fhe="simulate") + + # For linear classifiers, the circuit's de-quantized outputs should be the same as the ones + # from the `decision_function` built-in method + check_float_array_equal(y_pred, y_scores) + + # Apply post-processing step (in the clear) + # This includes (non-exhaustive): + # - sigmoid or softmax function for classifiers + # - final sum for tree-based models + # - link function for GLMs + y_pred = model.post_processing(y_pred) + + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + if ( + is_classifier_or_partial_classifier(model) + and get_model_name(model) != "KNeighborsClassifier" + ): + y_proba = model.predict_proba(x, fhe="simulate") + else: + y_proba = model.predict(x, fhe="simulate") - # Apply either the sigmoid if it is a binary classification task, - # which is the case in this example, or a softmax function in order - # to get the probabilities (in the clear) - y_proba = model.post_processing(y) + # The circuit's de-quantized outputs followed by `post_processing` should be the same as the + # ones from the `predict_proba` built-in method for classifiers, and from the `predict` + # built-in method for regressors + check_float_array_equal(y_pred, y_proba) - # Apply the argmax to get the class predictions (in the clear) - if is_classifier_or_partial_classifier(model): - y_class = numpy.argmax(y_proba, axis=-1) - y_pred_fhe += list(y_class) - else: - y_pred_fhe += list(y_proba) + # KNeighborsClassifier does not apply a final argmax for computing prediction + if ( + is_classifier_or_partial_classifier(model) + and get_model_name(model) != "KNeighborsClassifier" + ): + y_pred = numpy.argmax(y_pred, axis=-1) - # Compare with the FHE simulation mode - y_pred_expected_in_simulation = model.predict(x, fhe="simulate") - if numpy.isclose(numpy.array(y_pred_fhe), y_pred_expected_in_simulation).all(): - break + y_pred_class = model.predict(x, fhe="simulate") - assert numpy.isclose(numpy.array(y_pred_fhe), y_pred_expected_in_simulation).all(), ( - "computations are not the same between individual functions (in FHE) " - "and predict function (in FHE simulation mode)" - ) + # For classifiers (other than KNeighborsClassifier), the circuit's de-quantized outputs + # followed by 
`post_processing` as well as an argmax should be the same as the ones from + # the `predict` built-in method + check_float_array_equal(y_pred, y_pred_class) def check_input_support(model_class, n_bits, default_configuration, x, y, input_type): @@ -582,24 +677,26 @@ def cast_input(x, y, input_type): model.predict(x) # Similarly, we test `predict_proba` for classifiers - if is_classifier_or_partial_classifier(model): - if get_model_name(model_class) == "KNeighborsClassifier": - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 - pytest.skip("Skipping predict_proba for KNN, doesn't work for now") + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + if ( + is_classifier_or_partial_classifier(model) + and get_model_name(model_class) != "KNeighborsClassifier" + ): model.predict_proba(x) - # If n_bits is above N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS, do not compile the model - # as there won't be any crypto parameters - if n_bits >= N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS: - return - model.compile(x, default_configuration) # Make sure `predict` is working when FHE is disabled model.predict(x, fhe="simulate") # Similarly, we test `predict_proba` for classifiers - if is_classifier_or_partial_classifier(model): + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + if ( + is_classifier_or_partial_classifier(model) + and get_model_name(model_class) != "KNeighborsClassifier" + ): model.predict_proba(x, fhe="simulate") @@ -678,11 +775,12 @@ def check_grid_search(model_class, x, y, scoring): warnings.simplefilter("ignore", category=ConvergenceWarning) warnings.simplefilter("ignore", category=UndefinedMetricWarning) + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 if get_model_name(model_class) == "KNeighborsClassifier" and scoring in [ "roc_auc", "average_precision", ]: - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 pytest.skip("Skipping predict_proba for KNN, doesn't work for now") _ = GridSearchCV( @@ -690,57 +788,6 @@ def check_grid_search(model_class, x, y, scoring): ).fit(x, y) -def check_sklearn_equivalence(model_class, n_bits, x, y, check_accuracy, check_r2_score): - """Check equivalence between the two models returned by fit_benchmark: the Concrete ML model and - the scikit-learn model.""" - model = instantiate_model_generic(model_class, n_bits=n_bits) - - # Sometimes, we miss convergence, which is not a problem for our test - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=ConvergenceWarning) - - # Random state should be taken from the method parameter - model, sklearn_model = model.fit_benchmark(x, y) - - # If the model is a classifier - if is_classifier_or_partial_classifier(model): - - # Check that accuracies are similar - y_pred_cml = model.predict(x) - y_pred_sklearn = sklearn_model.predict(x) - check_accuracy(y_pred_sklearn, y_pred_cml) - - # If the model is a LinearSVC model, compute its predicted confidence score - # This is done separately as scikit-learn doesn't provide a predict_proba method for - # LinearSVC models - if get_model_name(model_class) == "LinearSVC": - y_pred_cml = model.decision_function(x) - y_pred_sklearn = sklearn_model.decision_function(x) - - # Else, compute the model's predicted probabilities - # predict_proba not 
implemented for KNeighborsClassifier for now - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 - elif get_model_name(model_class) != "KNeighborsClassifier": - y_pred_cml = model.predict_proba(x) - y_pred_sklearn = sklearn_model.predict_proba(x) - - # If the model is a regressor, compute its predictions - else: - y_pred_cml = model.predict(x) - y_pred_sklearn = sklearn_model.predict(x) - - # Check that predictions, probabilities or confidence scores are similar using the R2 score - check_r2_score(y_pred_sklearn, y_pred_cml) - - -def check_properties_of_circuit(model_class, fhe_circuit, check_circuit_has_no_tlu): - """Check some properties of circuit, depending on the model class""" - - if is_model_class_in_a_list(model_class, _get_sklearn_linear_models()): - # Check that no TLUs are found within the MLIR - check_circuit_has_no_tlu(fhe_circuit) - - def get_hyper_param_combinations(model_class): """Return the hyper_param_combinations, depending on the model class""" hyper_param_combinations: Dict[str, List[Any]] @@ -799,7 +846,6 @@ def check_hyper_parameters( n_bits, x, y, - test_correctness_in_clear, check_r2_score, check_accuracy, ): @@ -815,11 +861,6 @@ def check_hyper_parameters( model = instantiate_model_generic(model_class, n_bits=n_bits, **hyper_parameters) - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/2450 - # does not work for now, issue in HummingBird - if get_model_name(model_class) == "RandomForestClassifier" and n_bits == 2: - continue - # Also fit with these hyper parameters to check it works fine with warnings.catch_warnings(): # Sometimes, we miss convergence, which is not a problem for our test @@ -828,18 +869,17 @@ def check_hyper_parameters( # Here, we really need to fit, to take into account hyper parameters model.fit(x, y) - # Check correctness with sklearn (if we have sufficiently bits of precision) - if test_correctness_in_clear and n_bits >= N_BITS_THRESHOLD_FOR_SKLEARN_CORRECTNESS_TESTS: - check_correctness_with_sklearn( - model_class, - x, - y, - n_bits, - check_r2_score, - check_accuracy, - fhe="disable", - hyper_parameters=hyper_parameters, - ) + # Check correctness with sklearn + check_correctness_with_sklearn( + model_class, + x, + y, + n_bits, + check_r2_score, + check_accuracy, + fhe="disable", + hyper_parameters=hyper_parameters, + ) def check_fitted_compiled_error_raises(model_class, n_bits, x, y): @@ -869,9 +909,13 @@ def check_fitted_compiled_error_raises(model_class, n_bits, x, y): with pytest.raises(AttributeError, match=".* model is not fitted.*"): model.predict(x) - if is_classifier_or_partial_classifier(model_class): - if get_model_name(model) == "KNeighborsClassifier": - pytest.skip("predict_proba not implement for KNN") + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 + if ( + is_classifier_or_partial_classifier(model_class) + and get_model_name(model) != "KNeighborsClassifier" + ): + # Predicting probabilities using an untrained linear or tree-based classifier should not # be possible if not is_model_class_in_a_list(model_class, _get_sklearn_neural_net_models()): @@ -1086,49 +1130,9 @@ def check_load_fitted_sklearn_linear_models(model_class, n_bits, x, y): + get_sklearn_tree_models_and_datasets() + get_sklearn_neighbors_models_and_datasets(), ) -@pytest.mark.parametrize( - "n_bits", - [ - n - for n in N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS - if n >= N_BITS_THRESHOLD_FOR_SKLEARN_EQUIVALENCE_TESTS - ], -) 
-def test_quantization( - model_class, - parameters, - n_bits, - load_data, - check_r2_score, - check_accuracy, - is_weekly_option, - verbose=True, -): - """Test quantization.""" - x, y = get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option) - - if verbose: - print("Run check_sklearn_equivalence") - - check_sklearn_equivalence(model_class, n_bits, x, y, check_accuracy, check_r2_score) - - -# This test is a known flaky -# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3661 -@pytest.mark.flaky -@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) -@pytest.mark.parametrize( - "n_bits", - [ - n - for n in N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS - if n >= N_BITS_THRESHOLD_FOR_SKLEARN_CORRECTNESS_TESTS - ], -) def test_correctness_with_sklearn( model_class, parameters, - n_bits, load_data, check_r2_score, check_accuracy, @@ -1136,9 +1140,11 @@ def test_correctness_with_sklearn( verbose=True, ): """Test that Concrete ML and scikit-learn models are 'equivalent'.""" + + n_bits = N_BITS_THRESHOLD_FOR_SKLEARN_CORRECTNESS_TESTS + x, y = get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option) - # Check correctness with sklearn (if we have sufficiently bits of precision) if verbose: print("Run check_correctness_with_sklearn with fhe='disable'") @@ -1153,15 +1159,16 @@ def test_correctness_with_sklearn( ) -@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) +# Neural network hyper-parameters are not tested @pytest.mark.parametrize( - "n_bits", - N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, + "model_class, parameters", + get_sklearn_linear_models_and_datasets() + + get_sklearn_tree_models_and_datasets() + + get_sklearn_neighbors_models_and_datasets(), ) def test_hyper_parameters( model_class, parameters, - n_bits, load_data, check_r2_score, check_accuracy, @@ -1169,19 +1176,19 @@ def test_hyper_parameters( verbose=True, ): """Testing hyper parameters.""" + + n_bits = N_BITS_THRESHOLD_FOR_SKLEARN_CORRECTNESS_TESTS + x, y = get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option) if verbose: print("Run check_hyper_parameters") - test_correctness_in_clear = True - check_hyper_parameters( model_class, n_bits, x, y, - test_correctness_in_clear, check_r2_score, check_accuracy, ) @@ -1264,9 +1271,7 @@ def test_serialization( verbose=True, ): """Test Serialization.""" - # This test only checks the serialization's functionalities, so there is no need to test it - # over several n_bits - n_bits = min(N_BITS_REGULAR_BUILDS) + n_bits = get_n_bits_non_correctness(model_class) model, x = preamble(model_class, parameters, n_bits, load_data, is_weekly_option) @@ -1338,15 +1343,10 @@ def test_offset( @pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) -@pytest.mark.parametrize( - "n_bits", - N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, -) @pytest.mark.parametrize("input_type", ["numpy", "torch", "pandas", "list"]) def test_input_support( model_class, parameters, - n_bits, load_data, input_type, default_configuration, @@ -1354,6 +1354,8 @@ def test_input_support( verbose=True, ): """Test all models with Pandas, List or Torch inputs.""" + n_bits = get_n_bits_non_correctness(model_class) + x, y = get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option) if verbose: @@ -1362,26 +1364,27 @@ def test_input_support( check_input_support(model_class, n_bits, default_configuration, x, y, input_type) -@pytest.mark.parametrize("model_class, parameters", 
UNIQUE_MODELS_AND_DATASETS) -@pytest.mark.parametrize( - "n_bits", - N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS, -) -def test_subfunctions( +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) +def test_inference_methods( model_class, parameters, - n_bits, load_data, is_weekly_option, + check_float_array_equal, + default_configuration, verbose=True, ): - """Test subfunctions.""" + """Test inference methods.""" + n_bits = get_n_bits_non_correctness(model_class) + model, x = preamble(model_class, parameters, n_bits, load_data, is_weekly_option) + model.compile(x, default_configuration) + if verbose: - print("Run check_subfunctions") + print("Run check_inference_methods") - check_subfunctions(model, model_class, x) + check_inference_methods(model, model_class, x, check_float_array_equal) # Pipeline test sometimes fails with RandomForest models. This bug may come from Hummingbird @@ -1419,12 +1422,17 @@ def test_pipeline( pytest.param(True, id="simulate"), ], ) +# N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS bits is currently the +# limit to find crypto parameters for linear models +# make sure we only compile below that bit-width. +# Additionally, prevent computations in FHE with too many bits @pytest.mark.parametrize( "n_bits", [ - n - for n in N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS - if n >= N_BITS_THRESHOLD_FOR_PREDICT_CORRECTNESS_TESTS + n_bits + for n_bits in N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS + if n_bits + < min(N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS, N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE) ], ) # pylint: disable=too-many-branches @@ -1436,98 +1444,96 @@ def test_predict_correctness( load_data, default_configuration, check_is_good_execution_for_cml_vs_circuit, - check_circuit_has_no_tlu, is_weekly_option, - test_subfunctions_in_fhe=True, verbose=True, ): - """Test correct execution, if there is sufficiently n_bits.""" + """Test prediction correctness between clear quantized and FHE simulation or execution.""" + + # KNN can only be compiled with small quantization bit numbers for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3979 + if n_bits > 5 and get_model_name(model_class) == "KNeighborsClassifier": + pytest.skip("KNeighborsClassifier models can only run with 5 bits at most.") model, x = preamble(model_class, parameters, n_bits, load_data, is_weekly_option) - # How many samples for tests in FHE (i.e., predict with fhe = "execute" or "simulate") + # Run the test with more samples during weekly CIs or when using FHE simulation if is_weekly_option or simulate: - number_of_tests_in_fhe = 5 + fhe_samples = 5 else: - number_of_tests_in_fhe = 1 + fhe_samples = 1 - # How many samples for tests in quantized module (i.e., predict with fhe = "disable") - if is_weekly_option: - number_of_tests_in_non_fhe = 50 - else: - number_of_tests_in_non_fhe = 10 - - # Do some inferences in clear if verbose: - print( - "Inference in the clear (with " - f"number_of_tests_in_non_fhe = {number_of_tests_in_non_fhe})" - ) - # KNN works only for smaller quantization bits - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3979 - if n_bits > 5 and get_model_name(model) == "KNeighborsClassifier": - pytest.skip("Use less than 5 bits with KNN.") - - y_pred = model.predict(x[:number_of_tests_in_non_fhe]) - - list_of_possibilities = [False, True] - - # Prevent computations in FHE if too many bits - if n_bits >= N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE: - list_of_possibilities = [False] + print("Compile the model") - for 
test_with_execute_in_fhe in list_of_possibilities: - - # N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS bits is currently the - # limit to find crypto parameters for linear models - # make sure we only compile below that bit-width. - if test_with_execute_in_fhe and not n_bits >= N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS: + model.compile(x, default_configuration) - if verbose: - print("Compile the model") + if verbose: + print(f"Check prediction correctness for {fhe_samples} samples.") - with warnings.catch_warnings(): - fhe_circuit = model.compile( - x, - default_configuration, - show_mlir=verbose and (n_bits <= 8), - ) + # Check prediction correctness between quantized clear and FHE simulation or execution + check_is_good_execution_for_cml_vs_circuit(x[:fhe_samples], model=model, simulate=simulate) - check_properties_of_circuit(model_class, fhe_circuit, check_circuit_has_no_tlu) - if verbose: - print("Compilation done") +@pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) +# Test separated inference steps with new simulation once Concrete Python provides the feature +# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/4025 +@pytest.mark.parametrize( + "simulate", + [ + pytest.param(False, id="fhe"), + ], +) +# N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS bits is currently the +# limit to find crypto parameters for linear models +# make sure we only compile below that bit-width. +# Additionally, prevent computations in FHE with too many bits +@pytest.mark.parametrize( + "n_bits", + [ + n_bits + for n_bits in N_BITS_WEEKLY_ONLY_BUILDS + N_BITS_REGULAR_BUILDS + if n_bits + < min(N_BITS_LINEAR_MODEL_CRYPTO_PARAMETERS, N_BITS_THRESHOLD_TO_FORCE_EXECUTION_NOT_IN_FHE) + ], +) +# pylint: disable=too-many-branches +def test_separated_inference( + model_class, + parameters, + simulate, + n_bits, + load_data, + default_configuration, + is_weekly_option, + check_float_array_equal, + verbose=True, +): + """Test prediction correctness between clear quantized and FHE simulation or execution.""" - if verbose: - print( - "Run check_is_good_execution_for_cml_vs_circuit " - + f"(with number_of_tests_in_fhe = {number_of_tests_in_fhe})" - ) + # KNN can only be compiled with small quantization bit numbers for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3979 + if n_bits > 5 and get_model_name(model_class) == "KNeighborsClassifier": + pytest.skip("KNeighborsClassifier models can only run with 5 bits at most.") - # Check the `predict` method - check_is_good_execution_for_cml_vs_circuit( - x[:number_of_tests_in_fhe], model=model, simulate=simulate - ) + model, x = preamble(model_class, parameters, n_bits, load_data, is_weekly_option) - if test_subfunctions_in_fhe and (not simulate): - if verbose: - print("Testing subfunctions in FHE") + # Run the test with more samples during weekly CIs or when using FHE simulation + if is_weekly_option or simulate: + fhe_samples = 5 + else: + fhe_samples = 1 - check_subfunctions_in_fhe(model, fhe_circuit, x[:number_of_tests_in_fhe]) + if verbose: + print("Compile the model") - else: - if verbose: - print( - "Run predict in fhe='disable' " - f"(with number_of_tests_in_non_fhe = {number_of_tests_in_non_fhe})" - ) + fhe_circuit = model.compile(x, default_configuration) - # At least, check in clear mode - y_pred_fhe = model.predict(x[:number_of_tests_in_non_fhe], fhe="disable") + if verbose: + print("Run check_separated_inference") - # Check that the output shape is correct - assert y_pred_fhe.shape == y_pred.shape - assert 
numpy.array_equal(y_pred_fhe, y_pred) + # Check that separated inference steps (encrypt, run, decrypt, post_processing, ...) are + # equivalent to built-in methods (predict, predict_proba, ...) + check_separated_inference(model, fhe_circuit, x[:fhe_samples], check_float_array_equal) @pytest.mark.parametrize("model_class, parameters", UNIQUE_MODELS_AND_DATASETS) @@ -1539,7 +1545,7 @@ def test_fitted_compiled_error_raises( verbose=True, ): """Test Fit and Compile error raises.""" - n_bits = min(N_BITS_REGULAR_BUILDS) + n_bits = get_n_bits_non_correctness(model_class) x, y = get_dataset(model_class, parameters, n_bits, load_data, is_weekly_option) @@ -1550,6 +1556,8 @@ def test_fitted_compiled_error_raises( @pytest.mark.parametrize("model_class, parameters", MODELS_AND_DATASETS) +# Enable support for global_p_error testing if possible +# FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3297 @pytest.mark.parametrize( "error_param", [{"p_error": 0.9999999999990905}], # 1 - 2**-40 @@ -1564,21 +1572,10 @@ def test_p_error_global_p_error_simulation( ): """Test p_error and global_p_error simulation. - Description: - A model is compiled with a large p_error. The test then checks the predictions for - simulated and fully homomorphic encryption (FHE) inference, and asserts - that the predictions for both are different from the expected predictions. + The test checks that models compiled with a large p_error value predicts very different results + with simulation or in FHE compared to the expected clear quantized ones. """ - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3297 - if "global_p_error" in error_param: - pytest.skip("global_p_error behave very differently depending on the type of model.") - - if get_model_name(model_class) == "KNeighborsClassifier": - # KNN works only for smaller quantization bits - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3979 - n_bits = min([2] + N_BITS_REGULAR_BUILDS) - else: - n_bits = min(N_BITS_REGULAR_BUILDS) + n_bits = get_n_bits_non_correctness(model_class) # Get data-set, initialize and fit the model model, x = preamble(model_class, parameters, n_bits, load_data, is_weekly_option) @@ -1591,14 +1588,16 @@ def test_p_error_global_p_error_simulation( def check_for_divergent_predictions(x, model, fhe, max_iterations=N_ALLOWED_FHE_RUN): """Detect divergence between simulated/FHE execution and clear run.""" + + # KNeighborsClassifier does not provide a predict_proba method for now + # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 predict_function = ( model.predict_proba if is_classifier_or_partial_classifier(model) - # `predict_prob` not implemented yet for KNeighborsClassifier - # FIXME: https://github.com/zama-ai/concrete-ml-internal/issues/3962 and get_model_name(model) != "KNeighborsClassifier" else model.predict ) + y_expected = predict_function(x, fhe="disable") for i in range(max_iterations): y_pred = predict_function(x[i : i + 1], fhe=fhe).ravel() @@ -1716,3 +1715,32 @@ def test_load_fitted_sklearn_linear_models( print("Run check_load_pre_trained_sklearn_models") check_load_fitted_sklearn_linear_models(model_class, n_bits, x, y) + + +# Only circuits from linear models do not have any TLUs +@pytest.mark.parametrize("model_class, parameters", get_sklearn_linear_models_and_datasets()) +def test_linear_models_have_no_tlu( + model_class, + parameters, + load_data, + is_weekly_option, + check_circuit_has_no_tlu, + default_configuration, + verbose=True, +): + """Test that circuits from linear 
models have no TLUs.""" + + n_bits = min(N_BITS_REGULAR_BUILDS) + + model, x = preamble(model_class, parameters, n_bits, load_data, is_weekly_option) + + if verbose: + print("Compile the model") + + fhe_circuit = model.compile(x, default_configuration) + + if verbose: + print("Run check_circuit_has_no_tlu") + + # Check that no TLUs are found within the MLIR + check_circuit_has_no_tlu(fhe_circuit)
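
Note (not part of the patch): for readers unfamiliar with the deployment API that the reworked check_client_server_execution helper exercises, here is a minimal sketch of the same client/server round trip outside pytest. It only uses calls that appear in the diff above; the concrete.ml import paths, the LogisticRegression model, the random data and the "deployment"/"keys" directory names are illustrative assumptions, not something this patch introduces.

import numpy
from concrete.ml.deployment import FHEModelClient, FHEModelDev, FHEModelServer
from concrete.ml.sklearn import LogisticRegression

# Dev side: train and compile a built-in model (illustrative data)
x_train = numpy.random.rand(100, 2)
y_train = (x_train[:, 0] > 0.5).astype(int)
x_test = numpy.random.rand(5, 2)
model = LogisticRegression(n_bits=8).fit(x_train, y_train)
model.compile(x_train)

# Dev side: save client.zip and server.zip ("deployment" must be an empty directory)
FHEModelDev(path_dir="deployment", model=model).save()

# Client side: load the specs, generate keys and encrypt the input
client = FHEModelClient(path_dir="deployment", key_dir="keys")
client.load()
client.generate_private_and_evaluation_keys()
evaluation_keys = client.get_serialized_evaluation_keys()
encrypted_input = client.quantize_encrypt_serialize(x_test)

# Server side: run the FHE circuit on the encrypted, serialized input
server = FHEModelServer(path_dir="deployment")
server.load()
encrypted_output = server.run(encrypted_input, evaluation_keys)

# Client side: decrypt, de-quantize and post-process the result
y_pred = client.deserialize_decrypt_dequantize(encrypted_output)

The test then uses the check_array_equal and check_float_array_equal fixtures to compare y_pred (and its quantized counterpart from deserialize_decrypt) against the development model's own predictions.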