diff --git a/pyproject.toml b/pyproject.toml
index 010607a8298..0ecef5030e4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,7 +64,7 @@ dependencies = [
     "qtpy",
     "requests",
     "resfo",
-    "scipy >= 1.10.1, < 1.15",
+    "scipy >= 1.15",
     "seaborn",
     "tables",  # extra dependency for pandas (hdf5)
     "tabulate",
@@ -116,7 +116,6 @@ dev = [
     "pytest>6",
     "resdata",
     "resfo",
-    "scikit-learn",
     "sphinx",
     "sphinx-argparse",
     "sphinx-autoapi",
diff --git a/src/ert/analysis/misfit_preprocessor.py b/src/ert/analysis/misfit_preprocessor.py
index beb6a5e21c7..fd3c15a7ea7 100644
--- a/src/ert/analysis/misfit_preprocessor.py
+++ b/src/ert/analysis/misfit_preprocessor.py
@@ -34,6 +34,9 @@ def get_nr_primary_components(
     """
     Calculate the number of principal components needed to achieve a cumulative
     variance less than a specified threshold using Singular Value Decomposition (SVD).
+
+    responses is expected to be of shape n x p, where n is the number of
+    realizations and p is the number of observations.
     """
     data_matrix = responses - responses.mean(axis=0)
     _, singulars, _ = np.linalg.svd(data_matrix.astype(float), full_matrices=False)
@@ -42,7 +45,7 @@
     # We compute the cumulative sum of these, then divide by their total sum to get the
     # cumulative proportion of variance explained by each successive component.
     variance_ratio = np.cumsum(singulars**2) / np.sum(singulars**2)
-    return len([1 for i in variance_ratio[:-1] if i < threshold])
+    return int(np.argmax(variance_ratio >= threshold)) + 1
 
 
 def cluster_responses(
@@ -53,10 +56,12 @@
     """
     Cluster responses using hierarchical clustering based on Spearman correlation.
     Observations that tend to vary similarly across different simulation runs will be
    clustered together.
     """
-    correlation = spearmanr(responses).statistic
-    if isinstance(correlation, np.float64):
-        correlation = np.array([[1, correlation], [correlation, 1]])
-    linkage_matrix = linkage(correlation, "average", "euclidean")
+    distance = 1 - np.abs(spearmanr(responses).statistic)
+    if isinstance(distance, np.float64):
+        distance = np.array([[0.0, distance], [distance, 0.0]])
+    linkage_matrix = linkage(
+        distance[np.triu_indices(distance.shape[0], k=1)], "average", "euclidean"
+    )
     return fcluster(linkage_matrix, nr_clusters, criterion="maxclust", depth=2)
@@ -132,14 +137,14 @@
     scale_factors = np.ones(len(obs_errors))
     nr_components = np.ones(len(obs_errors), dtype=int)
 
-    scaled_responses = responses * obs_errors.reshape(-1, 1)
+    scaled_responses = responses / obs_errors.reshape(-1, 1)
 
     if len(obs_errors) <= 2:
         # Either observations are not correlated, or only correlated
         # each other
         return scale_factors, np.ones(len(obs_errors), dtype=int), nr_components
 
-    prim_components = get_nr_primary_components(scaled_responses, threshold=0.95)
+    prim_components = get_nr_primary_components(scaled_responses.T, threshold=0.95)
 
     clusters = cluster_responses(scaled_responses.T, nr_clusters=prim_components)
 
@@ -150,9 +155,8 @@
             components = 1
         else:
             components = get_nr_primary_components(
-                scaled_responses[index], threshold=0.95
+                scaled_responses[index].T, threshold=0.95
             )
-        components = 1 if components == 0 else components
         scale_factor = get_scaling_factor(len(index), components)
         nr_components[index] *= components
         scale_factors[index] *= scale_factor
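A minimal, self-contained sketch of how the revised cluster_responses path behaves on toy data; the toy matrix, seed, and variable names below are illustrative assumptions and not part of the patch:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
# Four observations (rows) over 50 realizations (columns); rows 0-1 and rows 2-3
# follow two independent signals, so they should form two clusters.
base_a, base_b = rng.standard_normal((2, 50))
responses = np.vstack(
    [
        base_a,
        base_a + 0.01 * rng.standard_normal(50),
        base_b,
        base_b + 0.01 * rng.standard_normal(50),
    ]
)

# spearmanr on the transposed matrix correlates observations (columns after .T),
# mirroring how main() passes scaled_responses.T to cluster_responses.
distance = 1 - np.abs(spearmanr(responses.T).statistic)
# linkage accepts a condensed (upper-triangular) distance vector directly, which is
# why the patch extracts the triu indices instead of passing a square matrix.
condensed = distance[np.triu_indices(distance.shape[0], k=1)]
clusters = fcluster(linkage(condensed, "average"), 2, criterion="maxclust")
print(clusters)  # e.g. [1 1 2 2]: strongly correlated observations share a cluster

The np.argmax change in get_nr_primary_components follows the same spirit: argmax returns the first index at which the cumulative variance ratio reaches the threshold, so adding 1 turns it into a component count that is always at least one.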
diff --git a/tests/ert/unit_tests/analysis/test_es_update.py b/tests/ert/unit_tests/analysis/test_es_update.py
index 2101a97abca..5b080c3a988 100644
--- a/tests/ert/unit_tests/analysis/test_es_update.py
+++ b/tests/ert/unit_tests/analysis/test_es_update.py
@@ -59,7 +59,6 @@ def obs() -> polars.DataFrame:
 
 
 @pytest.mark.integration_test
-@pytest.mark.flaky(reruns=5)
 @pytest.mark.parametrize(
     "misfit_preprocess", [[["*"]], [], [["FOPR"]], [["FOPR"], ["WOPR_OP1_1*"]]]
 )
diff --git a/tests/ert/unit_tests/analysis/test_misfit_preprocessor.py b/tests/ert/unit_tests/analysis/test_misfit_preprocessor.py
index 667bb8c2477..f1cbc35b501 100644
--- a/tests/ert/unit_tests/analysis/test_misfit_preprocessor.py
+++ b/tests/ert/unit_tests/analysis/test_misfit_preprocessor.py
@@ -34,9 +34,10 @@ def test_that_get_nr_primary_components_is_according_to_theory():
     # Fast sampling of correlated multivariate observations
     X = rng.standard_normal(size=(p, N))
     Y = (np.linalg.cholesky(R) @ X).T
-
+    Z = Y.copy()
     Y = StandardScaler().fit_transform(Y)
-
+    Z = (Z - Z.mean(axis=0)) / Z.std(axis=0)
+    np.testing.assert_almost_equal(Y, Z)
     lambda_1 = sigma**2 * (1 + (p - 1) * rho)
     lambda_remaining = sigma**2 * (1 - rho)
     s1 = np.sqrt(lambda_1 * (N - 1))
@@ -48,9 +49,12 @@
     threshold_3 = (s1**2 + 2 * s_remaining**2) / total
 
     # Adding a bit to the thresholds because of numerical accuracy.
-    assert get_nr_primary_components(Y, threshold_1 + 0.01) == 1
-    assert get_nr_primary_components(Y, threshold_2 + 0.01) == 2
-    assert get_nr_primary_components(Y, threshold_3 + 0.01) == 3
+    assert get_nr_primary_components(Y, threshold_1 + 0.01) == 2
+    assert get_nr_primary_components(Y, threshold_2 + 0.01) == 3
+    assert get_nr_primary_components(Y, threshold_3 + 0.01) == 4
+
+    # check that we always return at least 1
+    assert get_nr_primary_components(Y, 0) == 1
 
 
 @pytest.mark.parametrize("nr_observations", [4, 10, 100])
@@ -64,11 +68,11 @@ def test_misfit_preprocessor(nr_observations):
     nr_realizations = 1000
     Y = np.ones((nr_observations, nr_realizations))
     parameters_a = rng.standard_normal(nr_realizations)
-    parameters_b = rng.standard_normal(nr_realizations)
+    parameters_b = rng.normal(scale=5, size=nr_realizations)
     for i in range(nr_observations - 1):
         Y[i] = i + 1 * parameters_a
     Y[-1] = 5 + 1 * parameters_b
-    obs_errors = Y.mean(axis=1)
+    obs_errors = np.array([0.1] * nr_observations)
     Y_original = Y.copy()
     obs_error_copy = obs_errors.copy()
     result, *_ = main(Y, obs_errors)
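The eigenvalue formulas that test_that_get_nr_primary_components_is_according_to_theory relies on can be sanity-checked numerically; this is a sketch with assumed parameter values (p, rho, sigma), not the values the test itself fixes:

import numpy as np

p, rho, sigma = 10, 0.4, 2.0
# Equicorrelated covariance: sigma^2 * ((1 - rho) * I + rho * ones)
C = sigma**2 * ((1 - rho) * np.eye(p) + rho * np.ones((p, p)))
eigvals = np.sort(np.linalg.eigvalsh(C))[::-1]
# Largest eigenvalue is sigma^2 * (1 + (p - 1) * rho); the remaining p - 1 equal
# sigma^2 * (1 - rho), matching lambda_1 and lambda_remaining in the test.
np.testing.assert_allclose(eigvals[0], sigma**2 * (1 + (p - 1) * rho))
np.testing.assert_allclose(eigvals[1:], sigma**2 * (1 - rho))

With get_nr_primary_components now returning np.argmax(variance_ratio >= threshold) + 1, a threshold slightly above the cumulative ratio of the first k components is first reached one component later, which is why the expected counts in the test shift from 1, 2, 3 to 2, 3, 4, while a threshold of 0 still yields at least 1.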