From 1d06386d986eae227ed8ac442078c45b57da67d5 Mon Sep 17 00:00:00 2001 From: Kevin Klein <7267523+kklein@users.noreply.github.com> Date: Fri, 10 May 2024 12:29:49 +0200 Subject: [PATCH] Reduce test size (#73) --- .github/workflows/ci.yml | 2 +- benchmarks/benchmark.py | 2 +- benchmarks/readme.md | 12 ++++++------ tests/conftest.py | 2 +- tests/test_data_generation.py | 2 +- tests/test_learner.py | 28 +++++++++++++++------------- 6 files changed, 25 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 535d6f8..e3c02b6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,7 +78,7 @@ jobs: uses: quantco/pytest-action@v2 with: report-title: Unit Tests - ${{ matrix.os == 'ubuntu-latest' && 'Linux' || 'Windows' }} - Python ${{ matrix.python-version }} - custom-arguments: -n auto --cov=metalearners --cov-report=xml --cov-report term-missing --color=yes + custom-arguments: --cov=metalearners --cov-report=xml --cov-report term-missing --color=yes - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v4.3.1 with: diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py index 81bf8eb..5972719 100644 --- a/benchmarks/benchmark.py +++ b/benchmarks/benchmark.py @@ -38,7 +38,7 @@ def _synthetic_data( is_classification, rng, - sample_size=1_000_000, + sample_size=100_000, n_numericals=25, test_fraction=0.2, propensity_score=0.3, diff --git a/benchmarks/readme.md b/benchmarks/readme.md index c79ae67..2cbc554 100644 --- a/benchmarks/readme.md +++ b/benchmarks/readme.md @@ -25,16 +25,16 @@ on ground truth CATEs: | T-learner | causalml_in_sample | causalml_oos | econml_in_sample | econml_oos | metalearners_in_sample | metalearners_oos | | :----------------------------------------------------------- | -----------------: | -----------: | ---------------: | ---------: | ---------------------: | ---------------: | -| synthetic_data_continuous_outcome_binary_treatment_linear_te | 0.0121381 | 0.01212 | 0.0121381 | 0.01212 | 0.0124729 | 0.01212 | -| synthetic_data_binary_outcome_binary_treatment_linear_te | 0.0149216 | 0.0148903 | nan | nan | 0.0149779 | 0.0148903 | +| synthetic_data_continuous_outcome_binary_treatment_linear_te | 0.0458966 | 0.0456347 | 0.0458966 | 0.0456347 | 0.0467864 | 0.0456347 | +| synthetic_data_binary_outcome_binary_treatment_linear_te | 0.0212419 | 0.0215154 | nan | nan | 0.021512 | 0.0215154 | | twins_pandas | 0.34843 | 0.362315 | nan | nan | 0.354783 | 0.348551 | | twins_numpy | 0.308362 | 0.345602 | nan | nan | 0.349543 | 0.345602 | | S-learner | causalml_in_sample | causalml_oos | econml_in_sample | econml_oos | metalearners_in_sample | metalearners_oos | | :------------------------------------------------------------ | -----------------: | -----------: | ---------------: | ---------: | ---------------------: | ---------------: | -| synthetic_data_continuous_outcome_binary_treatment_linear_te | 11.6777 | 11.6584 | 11.6777 | 11.6585 | 11.6778 | 11.6584 | -| synthetic_data_binary_outcome_binary_treatment_linear_te | 0.256848 | 0.256304 | nan | nan | 0.256859 | 0.256304 | +| synthetic_data_continuous_outcome_binary_treatment_linear_te | 14.5706 | 14.6248 | 14.5706 | 14.6248 | 14.5729 | 14.6248 | +| synthetic_data_binary_outcome_binary_treatment_linear_te | 0.229101 | 0.228616 | nan | nan | 0.229231 | 0.2286 | | twins_pandas | 0.314253 | 0.318554 | nan | nan | 0.371613 | 0.319028 | | twins_numpy | 0.314253 | 0.318554 | nan | nan | 0.361345 | 0.318554 | -| synthetic_data_continuous_outcome_multi_treatment_linear_te | nan | nan | 11.6025 | 11.5957 | 11.6025 | 11.5957 | -| synthetic_data_continuous_outcome_multi_treatment_constant_te | nan | nan | 0.00499728 | 0.00499728 | 0.00384154 | 0.00349068 | +| synthetic_data_continuous_outcome_multi_treatment_linear_te | nan | nan | 14.1468 | 14.185 | 14.1478 | 14.1853 | +| synthetic_data_continuous_outcome_multi_treatment_constant_te | nan | nan | 0.0110779 | 0.0110778 | 0.0104649 | 0.00897915 | diff --git a/tests/conftest.py b/tests/conftest.py index de3beec..1d02f81 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -180,7 +180,7 @@ def n_categoricals(): @pytest.fixture(scope="module") def sample_size(): - return 1_000_000 + return 100_000 @pytest.fixture(scope="function") diff --git a/tests/test_data_generation.py b/tests/test_data_generation.py index 9be477c..2ce272a 100644 --- a/tests/test_data_generation.py +++ b/tests/test_data_generation.py @@ -208,7 +208,7 @@ def test_compute_experiment_outputs( expected_std = np.sqrt(sigma_y**2 + sigma_tau**2) for k in range(1, n_variants): actual_std = (y - mu[:, k])[treatment == k].std() - assert actual_std == pytest.approx(expected_std, abs=1e-2) + assert actual_std == pytest.approx(expected_std, abs=1e-2, rel=1e-1) @pytest.mark.parametrize("dataset", ["numerical_covariates", "mixed_covariates"]) diff --git a/tests/test_learner.py b/tests/test_learner.py index 2378382..ed89fdf 100644 --- a/tests/test_learner.py +++ b/tests/test_learner.py @@ -43,10 +43,10 @@ def _linear_base_learner_params( @pytest.mark.parametrize( "metalearner, outcome_kind, reference_value, treatment_kind, te_kind", [ - ("T", "binary", 0.0149, "binary", "linear"), - ("T", "continuous", 0.0121, "binary", "linear"), - ("S", "binary", 0.2568, "binary", "linear"), - ("S", "continuous", 11.6777, "binary", "linear"), + ("T", "binary", 0.0212, "binary", "linear"), + ("T", "continuous", 0.0456, "binary", "linear"), + ("S", "binary", 0.2290, "binary", "linear"), + ("S", "continuous", 14.5706, "binary", "linear"), ], ) def test_learner_synthetic_in_sample( @@ -84,18 +84,20 @@ def test_learner_synthetic_in_sample( rmse = root_mean_squared_error(true_cate, cate_estimates) assert rmse < reference_value * (1 + _REFERENCE_VALUE_TOLERANCE) if metalearner == "T": - np.testing.assert_allclose(cate_estimates, true_cate.reshape(-1), atol=0.15) + np.testing.assert_allclose( + cate_estimates, true_cate.reshape(-1), atol=0.3, rtol=0.3 + ) @pytest.mark.parametrize( "metalearner, outcome_kind, reference_value, treatment_kind, te_kind", [ - ("T", "binary", 0.0149, "binary", "linear"), - ("T", "continuous", 0.0121, "binary", "linear"), - ("S", "binary", 0.2563, "binary", "linear"), - ("S", "continuous", 11.6584, "binary", "linear"), - ("S", "continuous", 11.5957, "multi", "linear"), - ("S", "continuous", 0.004997, "multi", "constant"), + ("T", "binary", 0.0215, "binary", "linear"), + ("T", "continuous", 0.0456, "binary", "linear"), + ("S", "binary", 0.2286, "binary", "linear"), + ("S", "continuous", 14.6248, "binary", "linear"), + ("S", "continuous", 14.185, "multi", "linear"), + ("S", "continuous", 0.0111, "multi", "constant"), ], ) @pytest.mark.parametrize("oos_method", ["overall", "mean", "median"]) @@ -161,7 +163,7 @@ def test_learner_synthetic_oos( assert rmse < reference_value * (1 + _REFERENCE_VALUE_TOLERANCE) if metalearner == "T": np.testing.assert_allclose( - cate_estimates, true_cate_test.reshape(-1), atol=0.15 + cate_estimates, true_cate_test.reshape(-1), atol=0.3, rtol=0.3 ) @@ -220,7 +222,7 @@ def test_learner_synthetic_oos_ate(metalearner, treatment_kind, oos_method, requ ) actual_ate_estimate = np.mean(cate_estimates) target_ate_estimate = np.mean(true_cate_test) - assert actual_ate_estimate == pytest.approx(target_ate_estimate, abs=1e-2, rel=1e-2) + assert actual_ate_estimate == pytest.approx(target_ate_estimate, abs=1e-2, rel=1e-1) @pytest.mark.parametrize("metalearner, reference_value", [("T", 0.3623), ("S", 0.3186)])